Analysis of the performance of US-listed securities during COVID-19 Pandemic

Data Preparation

# load library
library(readr)
library(tidyverse)
library(lubridate)
library(zoo)
library(stringr)

Extract user holding from multiple files

# Extract user holding from multiple files

# Folder that holds one CSV of popularity history per ticker.
# Use `<-` for assignment, per R convention (the original used `=`).
folder_path <- "C:/stock_popularity_history"

# Full paths of every per-ticker file inside the folder
file_list <- list.files(folder_path, full.names = TRUE)

# Read one per-ticker CSV and tag every row with the ticker symbol derived
# from the file name.
#
# file_path: path to a CSV file named "<TICKER>.csv".
# Returns: the data.frame read from the file with an extra TICKER column.
get_data <- function(file_path) {
  # file_path_sans_ext() strips only the final extension, so tickers that
  # themselves contain a dot (e.g. "BRK.B.csv" -> "BRK.B") survive intact;
  # the previous regex "\\..+$" truncated them at the first dot ("BRK").
  file_name <- tools::file_path_sans_ext(basename(file_path))
  data <- read.csv(file_path)
  data$TICKER <- file_name
  data
}

# Read every per-ticker file and stack the rows
data_list <- file_list %>% map_df(~ get_data(.))

# map_df() already row-binds, so this is effectively a rename; kept so the
# variable name used below stays the same
user_holding <- bind_rows(data_list)


# Parse the calendar date directly from the "YYYY-MM-DD HH:MM:SS" strings.
# The previous two-step conversion (as.POSIXct in the *local* time zone,
# then as.Date, which converts via UTC) could shift late-evening snapshots
# onto the next calendar day.
user_holding$timestamp <- as.Date(user_holding$timestamp, format = "%Y-%m-%d")


# Collapse intraday snapshots to one row per TICKER/day, holding the daily
# mean of users_holding; other columns keep the first row's values
user_holding <- user_holding %>%
  group_by(TICKER, timestamp) %>%
  mutate(users_holding = mean(users_holding)) %>%
  distinct(timestamp, TICKER, .keep_all = TRUE) %>%
  ungroup()

user_holding <- user_holding %>% filter(timestamp >= '2019-08-20')
write.csv(user_holding, file = "user_holding.csv", row.names = FALSE)

Data import

# Task 1 - 3
# Input files, expected in the working directory:
#   Stock_data_part1.csv          - daily security data (see str() output below)
#   bill-rates-2002-2023.csv      - Treasury bill rates used as risk-free proxy
#   industry_code.csv             - TICKER -> sector/industry mapping
#   Fama_French_factors_daily.csv - daily Fama-French factors
#   user_holding.csv              - per-ticker daily holdings built above
# NOTE(review): column layouts are inferred from later usage - verify on load.
Stock_data_1 <- read.csv("Stock_data_part1.csv")
risk_free_rate <- read.csv("bill-rates-2002-2023.csv")
industry_code <- read.csv("industry_code.csv")
fama_french_factors <- read.csv("Fama_French_factors_daily.csv")
userholding <- read.csv("user_holding.csv")

Transform date format

# Make sure all the "date" columns are converted to ISO 8601 Date objects
Stock_data_1$date <- as.Date(Stock_data_1$date, format = "%Y-%m-%d")

risk_free_rate$date <- as.Date(risk_free_rate$date, format = "%Y-%m-%d")

# Fama-French dates are supplied as YYYYMMDD; a single parse is enough.
# (The previous format()-then-as.Date() round trip after this line was a
# no-op and has been removed.) as.character() guards against the column
# having been read in as an integer.
fama_french_factors$date <- as.Date(as.character(fama_french_factors$date), format = "%Y%m%d")

userholding$timestamp <- as.Date(userholding$timestamp, format = "%Y-%m-%d")
## Inspect dataset structure:
## 'data.frame':    1938801 obs. of  17 variables:
##  $ PERMNO : int  10026 10026 10026 10026 10026 10026 10026 10026 10026 10026 ...
##  $ date   : Date, format: "2019-08-20" "2019-08-21" ...
##  $ SHRCD  : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ TICKER : chr  "JJSF" "JJSF" "JJSF" "JJSF" ...
##  $ COMNAM : chr  "J & J SNACK FOODS CORP" "J & J SNACK FOODS CORP" "J & J SNACK FOODS CORP" "J & J SNACK FOODS CORP" ...
##  $ PERMCO : int  7976 7976 7976 7976 7976 7976 7976 7976 7976 7976 ...
##  $ BIDLO  : num  191 189 188 186 187 ...
##  $ ASKHI  : num  197 193 190 190 191 ...
##  $ PRC    : num  191 189 189 186 191 ...
##  $ VOL    : int  136698 101583 92198 75522 81788 85299 67790 62266 113788 93747 ...
##  $ RET    : chr  "-0.020298" "-0.009313" "-0.000158" "-0.016744" ...
##  $ BID    : num  191 189 189 186 191 ...
##  $ ASK    : num  191 189 190 186 191 ...
##  $ SHROUT : int  18841 18841 18841 18841 18841 18841 18841 18841 18841 18841 ...
##  $ OPENPRC: num  195 192 189 189 187 ...
##  $ NUMTRD : num  1903 2252 1805 1629 2070 ...
##  $ sprtrn : num  -0.007915 0.008247 -0.000506 -0.025946 0.010983 ...

Data Wrangling

1. Handle missing values and duplicate data

# Clean data: convert sentinel codes to NA and normalise types.
# NOTE(review): sentinel values (-99 volume, -44..-99 / ".A".." .D" return
# codes, 99 NUMTRD) follow the data vendor's missing-data conventions -
# confirm against the data dictionary.
dataset1 <- Stock_data_1 %>%
  mutate(
    PERMNO = as.character(PERMNO),
    # Negative PRC values are sign-flagged quotes; keep the magnitude
    PRC = abs(PRC),
    # Zeros in quote/price fields are placeholders, not real observations
    BIDLO = ifelse(BIDLO == 0.0, NA, BIDLO),
    ASKHI = ifelse(ASKHI == 0.0, NA, ASKHI),
    PRC = ifelse(PRC == 0.0, NA, PRC),
    VOL = ifelse(VOL == -99, NA, VOL),
    RET = case_when(
      RET %in% c(-44, -55, -66, -77, -88, -99) ~ NA,  
      RET %in% c(".A", ".B", ".C", ".D") ~ NA,        
      TRUE ~ as.numeric(as.character(RET))
    ),
    # Flag crossed quotes (ASK < BID) ONCE, before overwriting either side.
    # The previous version blanked ASK first and then re-used the already
    # blanked ASK when deciding whether to blank BID - it only worked
    # because ifelse(NA, ...) happens to return NA.
    crossed_quote = ASK - BID < 0,
    ASK = ifelse(crossed_quote, NA, as.numeric(as.character(ASK))),
    BID = ifelse(crossed_quote, NA, as.numeric(as.character(BID))),
    NUMTRD = ifelse(NUMTRD == 99, NA, NUMTRD)
  ) %>%
  select(-crossed_quote) %>%
  
  # Remove duplicate observations (keep one row per security per day)
  distinct(PERMNO, date, .keep_all = TRUE)

2. Remove low-observation securities

# Count the number of pre-COVID observations available for each security
total_count <- dataset1 %>%
  filter(date < "2020-03-20") %>%
  count(PERMNO, name = "Count")

summary(total_count$Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   147.0   147.0   141.7   147.0   147.0

Although the most common frequency is 147 observations, using 147 as the threshold could reduce the sample size significantly; hence, I decided to use the lower bound of observation counts within the COVID period as the threshold

# Define the threshold for low observation counts, using the COVID window
# (2020-02-14 .. 2020-03-20) as the reference period
obs_covid <- dataset1 %>%
  filter(date >= "2020-02-14", date <= "2020-03-20") %>%
  count(PERMNO, name = "Count")
obs_covid_summary <- summary(obs_covid$Count)

summary(obs_covid$Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   25.00   25.00   24.78   25.00   25.00
# Calculate lower bounds for observation counts
# Formula: 25th percentile - {1.5 * (75th percentile - 25th percentile)}
# (the classic Tukey lower fence built on the IQR).
# NOTE(review): summary() stores its quartiles rounded to ~4 significant
# digits; harmless here since counts are small whole numbers, but
# quantile() on the raw counts would be exact.
obs_lower_bound_covid <- obs_covid_summary["1st Qu."] - 1.5 * (obs_covid_summary["3rd Qu."] - obs_covid_summary["1st Qu."])

# Remove securities whose pre-COVID observation count falls below the fence
keep_permnos <- total_count$PERMNO[total_count$Count >= obs_lower_bound_covid]
dataset1_obs_removed <- dataset1 %>%
  filter(PERMNO %in% keep_permnos) %>%
  ungroup()
## Number of unique security (before remove low-obs):  8183
## Number of unique security (after remove low-obs):  7846

3. Assign TICKER to those have PERMNO in the entire period but missing TICKER in some dates

TICKER may change due to company name changes, mergers, or acquisitions, but PERMNO is a permanent unique identifier for each security that does not change with such corporate events.

# Check how many securities have a PERMNO but a blank TICKER on some dates
blank_by_permno <- dataset1_obs_removed %>%
  group_by(PERMNO) %>%
  summarise(has_blank = any(TICKER == ""),
            all_dates = n())

missing_ticker <- blank_by_permno %>%
  filter(has_blank) %>%
  pull(PERMNO)

cat("Security that have PERMNO but missing TICKER (before cleaning): ", length(missing_ticker))
## Security that have PERMNO but missing TICKER (before cleaning):  67
# Fill blank TICKER values from any non-blank TICKER of the same PERMNO
dataset1_cleaned <- dataset1_obs_removed %>%
  group_by(PERMNO) %>%
  mutate(
    TICKER = replace(
      TICKER,
      TICKER == "",
      first(na.omit(TICKER[TICKER != ""]))
    )
  ) %>%
  ungroup()

# Check after the replacement.
# The pre-cleaning check tested for blank ("") tickers while this one tested
# only for NA, so the before/after counts were not strictly comparable;
# test for both forms of missingness here.
missing_ticker_cleaned <- dataset1_cleaned %>%
  group_by(PERMNO) %>%
  summarise(missing_ticker = any(is.na(TICKER) | TICKER == ""), 
            all_dates = n()) %>%
  filter(missing_ticker) %>%
  pull(PERMNO)

cat("Security that have PERMNO but missing TICKER (after replacing): ", length(missing_ticker_cleaned))
## Security that have PERMNO but missing TICKER (after replacing):  30

4. Add (Compute) variables for further analysis

4.1 Filter ETFs and Stocks

# Keep only common stocks (SHRCD 11) and ETFs (SHRCD 73) with usable
# return and price observations
dataset1_filtered <- dataset1_cleaned %>%
  filter(SHRCD %in% c(11, 73), !is.na(RET), !is.na(PRC))

cat("Number of unique security in the dataset: ",length(unique(dataset1_filtered$PERMNO)))

4.2 Compute dollar_vol, bid_ask_spread, turnover_ratio

# Liquidity measures, computed row-by-row per security-day
dataset1_df1 <- dataset1_filtered %>%
  arrange(PERMNO, date) %>%
  mutate(
    # Compute the dollar volume (closing price x shares traded)
    dollar_vol = PRC * VOL,
    
    # Compute the bid-ask spread (relative): quote width divided by the
    # quote midpoint; NA whenever either quote is missing
    bid_ask_spread = (ASK - BID) / ((ASK + BID) / 2),
    
    # Compute the turnover ratio: volume over shares outstanding.
    # SHROUT appears to be reported in thousands, hence * 1000 (the same
    # scaling is used for market cap later) - TODO confirm units
    turnover_ratio = VOL / (SHROUT * 1000)
  )

4.3 Add risk-free rate

# Add Risk free rate (US Daily Treasury Bill Rates).
# Join the daily rate onto every security-day row by calendar date, then
# carry the last observed rate forward (na.locf) so trading days without a
# published rate inherit the most recent one; leading NAs before the first
# published rate are kept (na.rm = FALSE).
# NOTE(review): assumes the rates file provides columns `date` and
# `risk_free_rate` - verify against the CSV header.
dataset1_df2 <- dataset1_df1 %>%
  arrange(date) %>%
  left_join(
    risk_free_rate %>% 
      filter(
        date >= "2019-08-20" & date <= "2020-08-20"
      )
    , by = "date"
  ) %>%
  mutate(
    risk_free_rate = na.locf(risk_free_rate, na.rm = FALSE)
  )

4.4 Compute Sharpe ratio and volatility

# Define the total trading days observed in the sample
trading_days_total <- length(unique(dataset1$date))

dataset1_df3 <- dataset1_df2 %>%
  group_by(PERMNO) %>%
  arrange(date) %>%
  mutate(
    # 30-day right-aligned rolling mean/sd of daily returns per security.
    # rollapplyr() is used for BOTH so that na.rm = TRUE is actually
    # honoured: zoo::rollmean() has no na.rm argument, so the previous call
    # silently ignored it and returned NA for any window containing an NA.
    mean_ret = rollapplyr(RET, width = 30, FUN = mean, fill = NA, na.rm = TRUE),
    std_ret = rollapplyr(RET, width = 30, FUN = sd, fill = NA, na.rm = TRUE),
    
    # Annualised volatility, scaled by trading days in the sample
    volatility = std_ret * sqrt(trading_days_total),
    
    # Daily Sharpe ratio; NA when the rolling sd is missing or zero.
    # NOTE(review): risk_free_rate appears to be quoted in percent
    # (~0.1-1.8) while RET is a decimal daily return - confirm the units
    # before interpreting the level of this ratio.
    sharpe_ratio = ifelse(is.na(std_ret) | std_ret == 0, NA, (mean_ret - risk_free_rate) / std_ret)
    ) %>%
  ungroup() %>% 
  select(-c("mean_ret", "std_ret")) 

4.5 Add sector categories

# Add sector ID from the TICKER -> sector mapping
dataset1_df4 <- dataset1_df3 %>%
  left_join(industry_code, by = "TICKER") %>%
  mutate(
    # Underscore the multi-word sector labels; unmatched labels pass through
    SECTOR = recode(SECTOR,
      "Health Care" = "Health_Care",
      "Information Technology" = "Information_Technology",
      "Consumer Discretionary" = "Consumer_Discretionary",
      "Consumer Staples" = "Consumer_Staples",
      "Real Estate" = "Real_Estate",
      "Communication Services" = "Communication_Services"
    ),
    # Assign all missing value as Unknown Sector to avoid dropping large amount of data
    SECTOR = coalesce(SECTOR, "Unknown")
  ) %>% 
  select(-c(SECTOR_ID, INDUSTRY))

4.6 Add market share

# Compute each day's total user holdings across all securities
user_holding_summary <- userholding %>%
  group_by(timestamp) %>%
  summarise(total_holding = sum(users_holding, na.rm = TRUE), .groups = 'drop')

# Attach the daily total back onto the per-ticker rows
user_holding_1 <- userholding %>%
  left_join(user_holding_summary, by = "timestamp")

# Compute each ticker's share of that day's total holdings, in percent.
# (Plain row-wise division, so the previous group_by(TICKER, timestamp)
# was a no-op and has been dropped.)
user_holding_2 <- user_holding_1 %>%
  mutate(market_share = users_holding / total_holding * 100)

# Add the market_share to the main dataset.
# NOTE(review): the join key is TICKER + date; TICKER is less stable than
# PERMNO (see the ticker-cleaning step above), so verify no fan-out occurs.
dataset1_df5 <- dataset1_df4 %>%
  left_join(user_holding_2, by = c("TICKER" = "TICKER", "date" = "timestamp")) %>% 
  select(-c("users_holding", "total_holding"))

4.7 Add market capitalisation and categorise company size

# Market capitalisation and size buckets
dataset1_df6 <- dataset1_df5 %>%
  filter(!is.na(SHROUT)) %>% 
  mutate(
    # Compute market capitalisation (SHROUT reported in thousands)
    market_cap = SHROUT * PRC * 1000,
    # case_when() falls through in order, so each branch only needs its
    # upper bound
    company_size = case_when(
      market_cap <= 2e9 ~ "Small",
      market_cap <= 1e10 ~ "Medium",
      market_cap > 1e10 ~ "Large"
    )
  )

# Inspect the unique security count within each company size bucket
dataset1_df6 %>%
  group_by(company_size) %>%
  summarise(distinct_tickers = n_distinct(PERMNO)) %>%
  print()
## # A tibble: 3 × 2
##   company_size distinct_tickers
##   <chr>                   <int>
## 1 Large                     648
## 2 Medium                   1331
## 3 Small                    4843

4.8 Add absolute correlation between RET and sprtrn

# Absolute correlation between each security's daily return and the S&P 500
# return (sprtrn), computed once per PERMNO over its full sample and
# broadcast to every row of that security.
# NOTE(review): use = "complete.obs" errors when a security has no complete
# RET/sprtrn pairs; upstream filtering removed NA RET so this should not
# trigger here. The result remains grouped by PERMNO for the steps below.
dataset1_df7 <- dataset1_df6 %>%
  group_by(PERMNO) %>%
  mutate(
    abs_corr_sp = abs(cor(RET, sprtrn, use = "complete.obs")))

4.9 Add dummy variables to identify ETFs or stocks

# Label each security as an ETF (share code 73) or common stock (code 11);
# the data was filtered to these two codes earlier
dataset1_df8 <- dataset1_df7 %>%
  mutate(
    security_type = if_else(SHRCD == 73, "ETF",
                            if_else(SHRCD == 11, "Stock", NA_character_))
  )

4.10 Tracking error (ETF)

Tracking error measures how closely an investment portfolio’s returns align with the returns of a benchmark index. It is the standard deviation of the difference between the portfolio’s returns and the benchmark’s returns.

# Tracking error: standard deviation of the daily difference between each
# security's return and the S&P 500 return, one value per PERMNO.
dataset1_df9 <- dataset1_df8 %>% 
  group_by(PERMNO) %>% 
  mutate(
    # na.rm = TRUE makes the sd robust to isolated missing benchmark values.
    # Previously a single NA in RET - sprtrn turned the entire group's sd
    # into NA, while the per-row ifelse() only masked the rows where sprtrn
    # itself was NA - misleading, since sd is constant within the group.
    tracking_error = sd(RET - sprtrn, na.rm = TRUE)
  )

Inspect the data

To make sure no PERMNO accidentally dropped during data wrangling

## Number of unique security before data wrangling:  5929
## Number of unique security after data wrangling:  5929
## No unique security dropped during data wrangling:  TRUE
## Preview of cleaned dataset:
## # A tibble: 5 × 30
## # Groups:   PERMNO [5]
##   PERMNO date       SHRCD TICKER COMNAM       PERMCO  BIDLO  ASKHI    PRC    VOL
##   <chr>  <date>     <int> <chr>  <chr>         <int>  <dbl>  <dbl>  <dbl>  <int>
## 1 10026  2019-08-20    11 JJSF   J & J SNACK…   7976 191.   197.   191.   136698
## 2 10028  2019-08-20    11 DGSE   D G S E COM…   7978   0.77   1.05   0.88  91081
## 3 10032  2019-08-20    11 PLXS   PLEXUS CORP    7980  56.9   57.5   57.1  125933
## 4 10044  2019-08-20    11 RMCF   ROCKY MOUNT…   7992   9.16   9.35   9.34   6558
## 5 10051  2019-08-20    11 HNGR   HANGER INC     7999  18.9   19.3   18.9  206241
## # ℹ 20 more variables: RET <dbl>, BID <dbl>, ASK <dbl>, SHROUT <int>,
## #   OPENPRC <dbl>, NUMTRD <dbl>, sprtrn <dbl>, dollar_vol <dbl>,
## #   bid_ask_spread <dbl>, turnover_ratio <dbl>, risk_free_rate <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, SECTOR <chr>, market_share <dbl>,
## #   market_cap <dbl>, company_size <chr>, abs_corr_sp <dbl>,
## #   security_type <chr>, tracking_error <dbl>
## Summary of dataset:
##     PERMNO               date                SHRCD          TICKER         
##  Length:1445273     Min.   :2019-08-20   Min.   :11.00   Length:1445273    
##  Class :character   1st Qu.:2019-11-18   1st Qu.:11.00   Class :character  
##  Mode  :character   Median :2020-02-19   Median :11.00   Mode  :character  
##                     Mean   :2020-02-18   Mean   :34.06                     
##                     3rd Qu.:2020-05-20   3rd Qu.:73.00                     
##                     Max.   :2020-08-20   Max.   :73.00                     
##                                                                            
##     COMNAM              PERMCO          BIDLO              ASKHI         
##  Length:1445273     Min.   :    5   Min.   :     0.0   Min.   :     0.0  
##  Class :character   1st Qu.:27739   1st Qu.:    10.9   1st Qu.:    11.5  
##  Mode  :character   Median :51451   Median :    25.5   Median :    25.9  
##                     Mean   :41071   Mean   :    97.3   Mean   :    99.4  
##                     3rd Qu.:54933   3rd Qu.:    49.4   3rd Qu.:    50.3  
##                     Max.   :57031   Max.   :344550.0   Max.   :347400.0  
##                                                                          
##       PRC                VOL                 RET                 BID          
##  Min.   :     0.0   Min.   :        0   Min.   :-0.862893   Min.   :     0.0  
##  1st Qu.:    11.2   1st Qu.:    14900   1st Qu.:-0.011494   1st Qu.:    11.1  
##  Median :    25.7   Median :   138072   Median : 0.000372   Median :    25.6  
##  Mean   :    98.3   Mean   :  1266886   Mean   : 0.000953   Mean   :    98.3  
##  3rd Qu.:    49.9   3rd Qu.:   682045   3rd Qu.: 0.011945   3rd Qu.:    49.9  
##  Max.   :344970.0   Max.   :990919210   Max.   :10.251815   Max.   :345140.0  
##                                                             NA's   :53        
##       ASK               SHROUT           OPENPRC             NUMTRD       
##  Min.   :     0.0   Min.   :      9   Min.   :     0.0   Min.   :      0  
##  1st Qu.:    11.2   1st Qu.:   4627   1st Qu.:    11.1   1st Qu.:    204  
##  Median :    25.7   Median :  23750   Median :    25.8   Median :   1294  
##  Mean   :    98.5   Mean   :  92374   Mean   :    99.9   Mean   :   6339  
##  3rd Qu.:    50.0   3rd Qu.:  65639   3rd Qu.:    50.3   3rd Qu.:   4916  
##  Max.   :345589.0   Max.   :9308301   Max.   :345000.0   Max.   :1640000  
##  NA's   :53                           NA's   :32427      NA's   :793232   
##      sprtrn             dollar_vol        bid_ask_spread    turnover_ratio    
##  Min.   :-0.1198410   Min.   :0.000e+00   Min.   :0.00000   Min.   :  0.0000  
##  1st Qu.:-0.0040110   1st Qu.:2.145e+05   1st Qu.:0.00056   1st Qu.:  0.0025  
##  Median : 0.0018440   Median :2.234e+06   Median :0.00155   Median :  0.0058  
##  Mean   : 0.0007867   Mean   :6.393e+07   Mean   :0.00592   Mean   :  0.0230  
##  3rd Qu.: 0.0076950   3rd Qu.:1.934e+07   3rd Qu.:0.00456   3rd Qu.:  0.0126  
##  Max.   : 0.0938280   Max.   :1.143e+11   Max.   :1.98333   Max.   :770.2883  
##                                           NA's   :53                          
##  risk_free_rate    volatility      sharpe_ratio          SECTOR         
##  Min.   :0.110   Min.   : 0.00    Min.   :-38267.36   Length:1445273    
##  1st Qu.:0.170   1st Qu.: 0.18    1st Qu.:  -101.84   Class :character  
##  Median :1.420   Median : 0.39    Median :   -20.06   Mode  :character  
##  Mean   :0.909   Mean   : 0.55    Mean   :  -114.36                     
##  3rd Qu.:1.530   3rd Qu.: 0.75    3rd Qu.:    -4.18                     
##  Max.   :1.830   Max.   :30.07    Max.   :     0.10                     
##                  NA's   :171888   NA's   :171888                        
##   market_share      market_cap        company_size        abs_corr_sp       
##  Min.   :0.0      Min.   :2.970e+04   Length:1445273     Min.   :0.0008779  
##  1st Qu.:0.0      1st Qu.:5.880e+07   Class :character   1st Qu.:0.3884390  
##  Median :0.0      Median :3.302e+08   Mode  :character   Median :0.6248121  
##  Mean   :0.0      Mean   :5.992e+09                      Mean   :0.5898372  
##  3rd Qu.:0.0      3rd Qu.:1.971e+09                      3rd Qu.:0.8117317  
##  Max.   :2.3      Max.   :2.023e+12                      Max.   :1.0000000  
##  NA's   :800157                                                             
##  security_type      tracking_error     
##  Length:1445273     Min.   :0.0007778  
##  Class :character   1st Qu.:0.0158461  
##  Mode  :character   Median :0.0261979  
##                     Mean   :0.0348746  
##                     3rd Qu.:0.0441767  
##                     Max.   :0.6425179  
## 

5. Create function to save the plots

# To reduce the workload of manually saving every plot, create a function to save plot
save_plot <- function(plot, title) {
  # Build "<title>.png" inside the current working directory
  output_path <- file.path(getwd(), paste0(title, ".png"))
  
  # Write the plot at a fixed 12in x 6in, 600 dpi
  ggsave(filename = output_path,
         plot = plot,
         width = 12,
         height = 6,
         units = "in",
         dpi = 600)
}

1 Descriptive statistics & visualisation

# load library
library(e1071)
library(purrr)
library(ggplot2)
library(RColorBrewer)
library(plotly)
library(scales)

Descriptive Statistics

Overall

Transform the dataset into a time series dataset:

# Group by date and summarise for each variable by aggregation:
# means for per-security ratios, sums for market-wide totals
task1_overall <- dataset1_df9 %>% 
  group_by(date)  %>%
  summarise(
    RET = mean(RET, na.rm = TRUE),
    volatility = mean(volatility, na.rm = TRUE),
    sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
    dollar_vol = sum(dollar_vol, na.rm = TRUE),
    number_of_trade = sum(NUMTRD, na.rm = TRUE),
    bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
    turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  # Standardise variable to ensure they are in same scale.
  # NOTE(review): scale() returns a one-column matrix, which is why these
  # columns print as <dbl[,1]>; wrap in as.numeric() if plain vectors are
  # preferred downstream.
  mutate(
    scaled_RET = scale(RET),
    scaled_sharpe_ratio = scale(sharpe_ratio),
    scaled_volatility = scale(volatility),
    scaled_spread = scale(bid_ask_spread),
    scaled_dollar_vol = scale(dollar_vol),
    scaled_numtrd = scale(number_of_trade),
    scaled_turnover_ratio = scale(turnover_ratio)
  )

# Preview the first five daily aggregate rows
head(task1_overall, n = 5)
## # A tibble: 5 × 15
##   date            RET volatility sharpe_ratio    dollar_vol number_of_trade
##   <date>        <dbl>      <dbl>        <dbl>         <dbl>           <dbl>
## 1 2019-08-20 -0.00390        NaN          NaN 220253155332.         9213736
## 2 2019-08-21  0.00755        NaN          NaN 218034206986.         9096519
## 3 2019-08-22 -0.00264        NaN          NaN 223328802822.         9374476
## 4 2019-08-23 -0.0222         NaN          NaN 359904889558.        12753999
## 5 2019-08-26  0.00829        NaN          NaN 232556980779.         9642102
## # ℹ 9 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   scaled_RET <dbl[,1]>, scaled_sharpe_ratio <dbl[,1]>,
## #   scaled_volatility <dbl[,1]>, scaled_spread <dbl[,1]>,
## #   scaled_dollar_vol <dbl[,1]>, scaled_numtrd <dbl[,1]>,
## #   scaled_turnover_ratio <dbl[,1]>

Create the descriptive statistics summary for all variables for both periods:

# Non-COVID period (2019-10-14 .. 2019-11-20).
# This window starts well after the first 29 sample days, so the 30-day
# rolling statistics are fully formed here. (A second filter on
# date >= "2019-10-01" was a no-op - the window above already starts
# later - and has been removed.)
task1_overall_noncovid <- task1_overall %>% 
  filter(date >= "2019-10-14" & date <= "2019-11-20") %>%
  select(-date, -starts_with("scaled_")) %>% 
  summarise(across(everything(), list(
    min = ~ min(.x, na.rm = TRUE),
    q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
    mean = ~ mean(.x, na.rm = TRUE),
    median = ~ median(.x, na.rm = TRUE),
    q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
    max = ~ max(.x, na.rm = TRUE),
    sd = ~ sd(.x, na.rm = TRUE)
  ),
  .names = "{.col}_{.fn}")) 

# COVID period (2020-02-14 .. 2020-03-20).
# The window starts months after the sample begins, so the 30-day rolling
# statistics are fully formed here. (A second filter on
# date >= "2019-10-01" was a no-op and has been removed.)
task1_overall_covid <- task1_overall %>% 
  filter(date >= "2020-02-14" & date <= "2020-03-20") %>% 
  select(-date, -starts_with("scaled_")) %>% 
  summarise(across(everything(), list(
    min = ~ min(.x, na.rm = TRUE),
    q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
    mean = ~ mean(.x, na.rm = TRUE),
    median = ~ median(.x, na.rm = TRUE),
    q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
    max = ~ max(.x, na.rm = TRUE),
    sd = ~ sd(.x, na.rm = TRUE)
  ),
  .names = "{.col}_{.fn}"))

# Display the non-COVID overall summary statistics
print(task1_overall_noncovid)
## # A tibble: 1 × 49
##    RET_min  RET_q25 RET_mean RET_median RET_q75 RET_max  RET_sd volatility_min
##      <dbl>    <dbl>    <dbl>      <dbl>   <dbl>   <dbl>   <dbl>          <dbl>
## 1 -0.00508 -0.00207  0.00116   0.000432 0.00377  0.0139 0.00452          0.326
## # ℹ 41 more variables: volatility_q25 <dbl>, volatility_mean <dbl>,
## #   volatility_median <dbl>, volatility_q75 <dbl>, volatility_max <dbl>,
## #   volatility_sd <dbl>, sharpe_ratio_min <dbl>, sharpe_ratio_q25 <dbl>,
## #   sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>, sharpe_ratio_q75 <dbl>,
## #   sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>, dollar_vol_min <dbl>,
## #   dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>, dollar_vol_median <dbl>,
## #   dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, dollar_vol_sd <dbl>, …
# Display the COVID overall summary statistics
print(task1_overall_covid)
## # A tibble: 1 × 49
##   RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd volatility_min
##     <dbl>   <dbl>    <dbl>      <dbl>   <dbl>   <dbl>  <dbl>          <dbl>
## 1  -0.106 -0.0282  -0.0151   -0.00916 0.00576  0.0567 0.0431          0.351
## # ℹ 41 more variables: volatility_q25 <dbl>, volatility_mean <dbl>,
## #   volatility_median <dbl>, volatility_q75 <dbl>, volatility_max <dbl>,
## #   volatility_sd <dbl>, sharpe_ratio_min <dbl>, sharpe_ratio_q25 <dbl>,
## #   sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>, sharpe_ratio_q75 <dbl>,
## #   sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>, dollar_vol_min <dbl>,
## #   dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>, dollar_vol_median <dbl>,
## #   dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, dollar_vol_sd <dbl>, …

By Industry Sector

Transform the dataset into a panel (Date & Industry)

# Group by date and SECTOR, summarise for each variable by aggregation:
# means for per-security ratios, sums for sector-wide totals
task1_sector <- dataset1_df9 %>% 
  group_by(SECTOR, date) %>%
  summarise(
    RET = mean(RET, na.rm = TRUE),
    volatility = mean(volatility, na.rm = TRUE),
    sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
    dollar_vol = sum(dollar_vol, na.rm = TRUE),
    number_of_trade = sum(NUMTRD, na.rm = TRUE),
    bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
    turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  # Standardise variable to ensure they are in same scale.
  # NOTE(review): .groups = "drop" above means scale() standardises over the
  # POOLED panel (all sectors together), not within each sector - confirm
  # that is the intended comparison.
  mutate(
    scaled_RET = scale(RET),
    scaled_sharpe_ratio = scale(sharpe_ratio),
    scaled_volatility = scale(volatility),
    scaled_spread = scale(bid_ask_spread),
    scaled_dollar_vol = scale(dollar_vol),
    scaled_numtrd = scale(number_of_trade),
    scaled_turnover_ratio = scale(turnover_ratio)
  ) 

# Preview the first five sector-by-date rows
head(task1_sector, n = 5)
## # A tibble: 5 × 16
##   SECTOR  date            RET volatility sharpe_ratio dollar_vol number_of_trade
##   <chr>   <date>        <dbl>      <dbl>        <dbl>      <dbl>           <dbl>
## 1 Commun… 2019-08-20 -0.00360        NaN          NaN    1.19e10          601204
## 2 Commun… 2019-08-21  0.00552        NaN          NaN    1.04e10          597822
## 3 Commun… 2019-08-22 -0.00377        NaN          NaN    1.04e10          568559
## 4 Commun… 2019-08-23 -0.0238         NaN          NaN    1.52e10          776119
## 5 Commun… 2019-08-26  0.0118         NaN          NaN    1.20e10          620346
## # ℹ 9 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   scaled_RET <dbl[,1]>, scaled_sharpe_ratio <dbl[,1]>,
## #   scaled_volatility <dbl[,1]>, scaled_spread <dbl[,1]>,
## #   scaled_dollar_vol <dbl[,1]>, scaled_numtrd <dbl[,1]>,
## #   scaled_turnover_ratio <dbl[,1]>

Create the descriptive statistics summary for all variables for both periods

# Non-COVID period (2019-10-14 .. 2019-11-20); window starts well after the
# first 29 sample days so the 30-day rolling statistics are fully formed.
# (A second filter on date >= "2019-10-01" was a no-op and has been removed.)
task1_sector_noncovid <- task1_sector %>% 
  filter(date >= "2019-10-14" & date <= "2019-11-20") %>% 
  select(-date, -starts_with("scaled_")) %>% 
  group_by(SECTOR) %>%
  summarise(across(everything(), list(
    min = ~ min(.x, na.rm = TRUE),
    q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
    mean = ~ mean(.x, na.rm = TRUE),
    median = ~ median(.x, na.rm = TRUE),
    q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
    max = ~ max(.x, na.rm = TRUE),
    sd = ~ sd(.x, na.rm = TRUE)
  ),
  .names = "{.col}_{.fn}")) %>% 
  arrange(SECTOR)

# COVID period (2020-02-14 .. 2020-03-20); window starts months after the
# sample begins so the 30-day rolling statistics are fully formed.
# (A second filter on date >= "2019-10-01" was a no-op and has been removed.)
task1_sector_covid <- task1_sector %>% 
  filter(date >= "2020-02-14" & date <= "2020-03-20") %>% 
  select(-date, -starts_with("scaled_")) %>% 
  group_by(SECTOR) %>%
  summarise(across(everything(), list(
    min = ~ min(.x, na.rm = TRUE),
    q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
    mean = ~ mean(.x, na.rm = TRUE),
    median = ~ median(.x, na.rm = TRUE),
    q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
    max = ~ max(.x, na.rm = TRUE),
    sd = ~ sd(.x, na.rm = TRUE)
  ),
  .names = "{.col}_{.fn}")) %>% 
  arrange(SECTOR)

# Display the non-COVID summary statistics by sector
print(task1_sector_noncovid)
## # A tibble: 12 × 50
##    SECTOR           RET_min  RET_q25 RET_mean RET_median RET_q75 RET_max  RET_sd
##    <chr>              <dbl>    <dbl>    <dbl>      <dbl>   <dbl>   <dbl>   <dbl>
##  1 Communication_… -0.0103  -0.00402  1.42e-3  0.00216   0.00703 0.0160  0.00732
##  2 Consumer_Discr… -0.0126  -0.00294  1.33e-3  0.00235   0.00547 0.0128  0.00608
##  3 Consumer_Stapl… -0.00737 -0.00114  3.92e-4  0.000184  0.00192 0.00966 0.00414
##  4 Energy          -0.0306  -0.0121  -1.92e-3 -0.00265   0.00589 0.0435  0.0175 
##  5 Financials      -0.00650 -0.00141  1.93e-3  0.00161   0.00504 0.0122  0.00467
##  6 Health_Care     -0.0140  -0.00291  2.97e-3  0.00202   0.00645 0.0255  0.00930
##  7 Industrials     -0.00984 -0.00271  8.61e-4  0.0000116 0.00380 0.0198  0.00650
##  8 Information_Te… -0.0101  -0.00320  1.58e-3  0.00229   0.00701 0.0201  0.00710
##  9 Materials       -0.0179  -0.00396  1.28e-4 -0.000200  0.00463 0.0261  0.00842
## 10 Real_Estate     -0.0109  -0.00211  1.42e-3  0.00212   0.00401 0.0167  0.00596
## 11 Unknown         -0.00403 -0.00146  9.18e-4  0.000280  0.00355 0.0111  0.00364
## 12 Utilities       -0.0169  -0.00443 -7.81e-4  0.00158   0.00304 0.0112  0.00690
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## #   volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## #   volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## #   sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## #   sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## #   dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## #   dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
# Display the COVID summary statistics by sector
print(task1_sector_covid)
## # A tibble: 12 × 50
##    SECTOR             RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
##    <chr>                <dbl>   <dbl>    <dbl>      <dbl>   <dbl>   <dbl>  <dbl>
##  1 Communication_Ser… -0.119  -0.0347  -0.0199   -0.0225  6.19e-3  0.0998 0.0523
##  2 Consumer_Discreti… -0.156  -0.0395  -0.0242   -0.0198  6.93e-3  0.0908 0.0560
##  3 Consumer_Staples   -0.0965 -0.0242  -0.0103   -0.0115  8.52e-4  0.0648 0.0408
##  4 Energy             -0.241  -0.0494  -0.0259   -0.0157  6.57e-3  0.179  0.0808
##  5 Financials         -0.121  -0.0365  -0.0171   -0.0217  4.18e-3  0.0858 0.0539
##  6 Health_Care        -0.125  -0.0209  -0.0107   -0.00270 1.34e-2  0.0852 0.0487
##  7 Industrials        -0.115  -0.0367  -0.0192   -0.0162  5.69e-3  0.0609 0.0490
##  8 Information_Techn… -0.120  -0.0320  -0.0167   -0.0187  1.44e-2  0.0689 0.0489
##  9 Materials          -0.116  -0.0405  -0.0170   -0.00972 6.92e-3  0.0702 0.0509
## 10 Real_Estate        -0.152  -0.0388  -0.0208   -0.0143  9.39e-3  0.106  0.0560
## 11 Unknown            -0.0917 -0.0241  -0.0135   -0.00668 3.98e-3  0.0493 0.0368
## 12 Utilities          -0.119  -0.0406  -0.0124   -0.0122  2.71e-3  0.138  0.0538
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## #   volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## #   volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## #   sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## #   sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## #   dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## #   dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …

By Company size

Transform the dataset into a panel (Date & Company Size)

# Group by date and company_size, summarise for each variable by aggregation:
# means for per-security ratios, sums for bucket-wide totals
task1_size <- dataset1_df9 %>% 
  group_by(company_size, date) %>%
  summarise(
    RET = mean(RET, na.rm = TRUE),
    volatility = mean(volatility, na.rm = TRUE),
    sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
    dollar_vol = sum(dollar_vol, na.rm = TRUE),
    number_of_trade = sum(NUMTRD, na.rm = TRUE),
    bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
    turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  # Standardise variable to ensure they are in same scale.
  # NOTE(review): scaling is over the pooled panel (all size buckets
  # together), not within each bucket - confirm that is intended.
  mutate(
    scaled_RET = scale(RET),
    scaled_sharpe_ratio = scale(sharpe_ratio),
    scaled_volatility = scale(volatility),
    scaled_spread = scale(bid_ask_spread),
    scaled_dollar_vol = scale(dollar_vol),
    scaled_numtrd = scale(number_of_trade),
    scaled_turnover_ratio = scale(turnover_ratio)
  ) %>% 
  # desc() on a character column sorts reverse-alphabetically, which here
  # happens to read Small -> Medium -> Large (not a semantic size order)
  arrange(desc(company_size), date)

# Preview the first five size-by-date rows
head(task1_size, n = 5)
## # A tibble: 5 × 16
##   company_size date            RET volatility sharpe_ratio   dollar_vol
##   <chr>        <date>        <dbl>      <dbl>        <dbl>        <dbl>
## 1 Small        2019-08-20 -0.00316        NaN          NaN 18029134547.
## 2 Small        2019-08-21  0.00723        NaN          NaN 18198242694.
## 3 Small        2019-08-22 -0.00332        NaN          NaN 19050070813.
## 4 Small        2019-08-23 -0.0212         NaN          NaN 30848625441.
## 5 Small        2019-08-26  0.00824        NaN          NaN 19840213213.
## # ℹ 10 more variables: number_of_trade <dbl>, bid_ask_spread <dbl>,
## #   turnover_ratio <dbl>, scaled_RET <dbl[,1]>, scaled_sharpe_ratio <dbl[,1]>,
## #   scaled_volatility <dbl[,1]>, scaled_spread <dbl[,1]>,
## #   scaled_dollar_vol <dbl[,1]>, scaled_numtrd <dbl[,1]>,
## #   scaled_turnover_ratio <dbl[,1]>

Create the descriptive statistics summary for all variables for both periods

# Summary statistics applied to every metric column of the size panel.
size_summary_funs <- list(
  min = ~ min(.x, na.rm = TRUE),
  q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
  mean = ~ mean(.x, na.rm = TRUE),
  median = ~ median(.x, na.rm = TRUE),
  q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
  max = ~ max(.x, na.rm = TRUE),
  sd = ~ sd(.x, na.rm = TRUE)
)

# Descriptive statistics per company size over one date window.
# Each window starts well into the sample so that the first 29 days'
# observations (where the 30-day rolling metrics are incomplete) do not
# affect the summary statistics.
size_window_summary <- function(panel, from, to) {
  panel %>%
    filter(date >= from & date <= to) %>%
    select(-date, -starts_with("scaled_")) %>%
    group_by(company_size) %>%
    summarise(across(everything(), size_summary_funs, .names = "{.col}_{.fn}")) %>%
    arrange(desc(company_size))
}

# Non-COVID period
task1_size_noncovid <- size_window_summary(task1_size, "2019-10-14", "2019-11-20")

# COVID period
task1_size_covid <- size_window_summary(task1_size, "2020-02-14", "2020-03-20")

print(task1_size_noncovid)
## # A tibble: 3 × 50
##   company_size  RET_min   RET_q25 RET_mean RET_median RET_q75 RET_max  RET_sd
##   <chr>           <dbl>     <dbl>    <dbl>      <dbl>   <dbl>   <dbl>   <dbl>
## 1 Small        -0.00574 -0.00267  0.000853   0.000216 0.00326  0.0141 0.00476
## 2 Medium       -0.00611 -0.00146  0.00221    0.00152  0.00567  0.0149 0.00479
## 3 Large        -0.00495 -0.000924 0.00186    0.00163  0.00369  0.0109 0.00361
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## #   volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## #   volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## #   sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## #   sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## #   dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## #   dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
print(task1_size_covid)
## # A tibble: 3 × 50
##   company_size RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
##   <chr>          <dbl>   <dbl>    <dbl>      <dbl>   <dbl>   <dbl>  <dbl>
## 1 Small         -0.103 -0.0267  -0.0151   -0.00893 0.00569  0.0573 0.0424
## 2 Medium        -0.120 -0.0324  -0.0161   -0.0113  0.00661  0.0739 0.0474
## 3 Large         -0.110 -0.0350  -0.0138   -0.0101  0.00483  0.0776 0.0447
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## #   volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## #   volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## #   sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## #   sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## #   dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## #   dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …

By Security Types

Transform the dataset into a panel (Date & Security Type)

# Group by date and security_type, summarise for each variable by aggregation.
# Builds a (security_type x date) panel mirroring the company-size panel:
# per-security metrics are averaged, activity measures are summed.
task1_type <- dataset1_df9 %>% 
  group_by(security_type, date) %>%
  summarise(
    RET = mean(RET, na.rm = TRUE),
    volatility = mean(volatility, na.rm = TRUE),
    sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
    dollar_vol = sum(dollar_vol, na.rm = TRUE),
    number_of_trade = sum(NUMTRD, na.rm = TRUE),
    bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
    turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
    .groups = "drop"
  ) %>% 
  # Standardise variable to ensure they are in same scale.
  # NOTE(review): as with the size panel, scale() pools all types and dates;
  # the resulting columns are 1-column matrices (<dbl[,1]>).
  mutate(
    scaled_RET = scale(RET),
    scaled_sharpe_ratio = scale(sharpe_ratio),
    scaled_volatility = scale(volatility),
    scaled_spread = scale(bid_ask_spread),
    scaled_dollar_vol = scale(dollar_vol),
    scaled_numtrd = scale(number_of_trade),
    scaled_turnover_ratio = scale(turnover_ratio)
  ) %>% arrange(desc(security_type), date)

head(task1_type, n = 5)
## # A tibble: 5 × 16
##   security_type date            RET volatility sharpe_ratio    dollar_vol
##   <chr>         <date>        <dbl>      <dbl>        <dbl>         <dbl>
## 1 Stock         2019-08-20 -0.00465        NaN          NaN 155680282032.
## 2 Stock         2019-08-21  0.00875        NaN          NaN 152342708836.
## 3 Stock         2019-08-22 -0.00303        NaN          NaN 154016031104.
## 4 Stock         2019-08-23 -0.0268         NaN          NaN 215699996631.
## 5 Stock         2019-08-26  0.0101         NaN          NaN 150830388226.
## # ℹ 10 more variables: number_of_trade <dbl>, bid_ask_spread <dbl>,
## #   turnover_ratio <dbl>, scaled_RET <dbl[,1]>, scaled_sharpe_ratio <dbl[,1]>,
## #   scaled_volatility <dbl[,1]>, scaled_spread <dbl[,1]>,
## #   scaled_dollar_vol <dbl[,1]>, scaled_numtrd <dbl[,1]>,
## #   scaled_turnover_ratio <dbl[,1]>

Create the descriptive statistics summary for all variables for both periods

# Summary statistics applied to every metric column of the type panel.
type_summary_funs <- list(
  min = ~ min(.x, na.rm = TRUE),
  q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
  mean = ~ mean(.x, na.rm = TRUE),
  median = ~ median(.x, na.rm = TRUE),
  q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
  max = ~ max(.x, na.rm = TRUE),
  sd = ~ sd(.x, na.rm = TRUE)
)

# Descriptive statistics per security type over one date window.
# Windows start after the first 29 days so incomplete 30-day rolling
# metrics do not affect the summary statistics.
type_window_summary <- function(panel, from, to) {
  panel %>%
    filter(date >= from & date <= to) %>%
    select(-date, -starts_with("scaled_")) %>%
    group_by(security_type) %>%
    summarise(across(everything(), type_summary_funs, .names = "{.col}_{.fn}"))
}

# Non-COVID period
task1_type_noncovid <- type_window_summary(task1_type, "2019-10-14", "2019-11-20")

# COVID period
task1_type_covid <- type_window_summary(task1_type, "2020-02-14", "2020-03-20")

print(task1_type_noncovid)
## # A tibble: 2 × 50
##   security_type  RET_min   RET_q25 RET_mean RET_median RET_q75 RET_max  RET_sd
##   <chr>            <dbl>     <dbl>    <dbl>      <dbl>   <dbl>   <dbl>   <dbl>
## 1 ETF           -0.00298 -0.000675 0.000909  0.0000341 0.00273 0.00846 0.00269
## 2 Stock         -0.00738 -0.00292  0.00131   0.000338  0.00381 0.0171  0.00573
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## #   volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## #   volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## #   sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## #   sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## #   dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## #   dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
print(task1_type_covid)
## # A tibble: 2 × 50
##   security_type RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
##   <chr>           <dbl>   <dbl>    <dbl>      <dbl>   <dbl>   <dbl>  <dbl>
## 1 ETF           -0.0806 -0.0228  -0.0122   -0.00954 0.00387  0.0465 0.0321
## 2 Stock         -0.121  -0.0319  -0.0168   -0.0124  0.00688  0.0749 0.0501
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## #   volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## #   volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## #   sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## #   sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## #   dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## #   dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …

Save the descriptive summaries as CSV files (for the report)

# Persist every Task-1 descriptive summary table (overall, by sector,
# by company size, by security type; non-COVID and COVID) as a CSV file
# named after the object, for use in the report.
task1_summary_tables <- c(
  "task1_overall_noncovid", "task1_overall_covid",
  "task1_sector_noncovid", "task1_sector_covid",
  "task1_size_noncovid", "task1_size_covid",
  "task1_type_noncovid", "task1_type_covid"
)

for (table_name in task1_summary_tables) {
  write.csv(get(table_name), file = paste0(table_name, ".csv"), row.names = FALSE)
}

Time series plot

Function to create time series plot (Overall)

# Dataset for this plot is time series.
#
# Draw (and save) one time-series line plot per variable for the overall panel.
#
# data      - data frame with a `date` column plus the columns in `variables`
# variables - column names to plot (the scaled metrics)
# var_names - display labels, matched to `variables` by position
# periods   - list of lists with $start, $end, $color, $label describing the
#             comparison windows to shade
#
# Side effects: saves each plot via save_plot() (defined elsewhere in this
# file) and prints every plot. Returns nothing useful.
time_series_plot_overall <- function(data, variables, var_names, periods) {
  
  # Convert the dataset to long format: one (date, variable, value) row,
  # with `variable` relabelled for display
  data_long <- data %>%
    pivot_longer(cols = all_of(variables), 
                 names_to = "variable", 
                 values_to = "value") %>%
    mutate(variable = factor(variable, 
                             levels = variables, 
                             labels = var_names)
           )
  
  # Create shaded areas for showing comparison periods
  # (one full-height rectangle per period)
  shaded_areas <- bind_rows(
    lapply(periods, function(p) {
      data.frame(xmin = as.Date(p$start), 
                 xmax = as.Date(p$end), 
                 ymin = -Inf, 
                 ymax = Inf, 
                 fill = p$color, 
                 label = p$label)
    })
  )
  
  # Create plots for each variable
  create_plot <- function(var_name, dataset_name) {
    plot_data <- data_long %>% 
      filter(variable == var_name)
    
    p <- ggplot(plot_data, aes(x = date, 
                               y = value)) +
      
      # Shaded rectangles are added first so the data line is drawn on top
      geom_rect(data = shaded_areas, 
                aes(xmin = xmin, 
                    xmax = xmax, 
                    ymin = ymin, 
                    ymax = ymax, 
                    fill = label),
                alpha = 0.2, 
                inherit.aes = FALSE) +
      
      # NOTE(review): `size` for lines is deprecated in ggplot2 >= 3.4
      # in favour of `linewidth` — confirm the ggplot2 version in use
      geom_line(aes(color = "Data"), 
                size = 1) +
      
      scale_fill_manual(values = setNames(unique(shaded_areas$fill),
                                          unique(shaded_areas$label)),
                        name = "Period") +
      
      guides(fill = guide_legend(title = "Periods for Comparison"),
             color = guide_legend(title = "Metrics")) +
      
      labs(title = paste("Overall Performance of", var_name, "during 20 August 2019 to 20 August 2020"),
           x = "Date", 
           y = paste("Standardised", var_name)) +
      
      scale_x_date(date_labels = "%Y-%m-%d", date_breaks = "1 month") +
      
      theme_minimal() +
      
      theme(axis.text.x = element_text(angle = 55, hjust = 1)) +
      
      scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
      
      # The single series is legended under the variable's display name
      scale_color_manual(name = "Metrics",
                         values = c("Data" = "red"),
                         labels = c("Data" = var_name))

    # Save plot for report
    save_plot(p, paste0(dataset_name, "_", var_name))
    
    return(p)
  }
  
  # To keep the saved plot file name easy to read:
  # capture the caller's variable name (e.g. "task1_overall") for the filename
  dataset_name <- deparse(substitute(data))
  
  # Create a list of plots
  plots <- lapply(var_names, function(var_name) {
    create_plot(var_name, dataset_name)
  })
  
  # Print plots
  for (plot in plots) {
    print(plot)
  }
}

Function to create time series plot (By group)

# Draw (and save) one grouped time-series plot per variable.
#
# data      - data frame with `date`, the grouping column, and `variables`
# variables - column names to plot (the scaled metrics)
# var_names - display labels, matched to `variables` by position
# label_name- legend title for the grouping variable
# periods   - list of lists with $start, $end, $color, $label (shaded windows)
# group     - name (string) of the grouping column, e.g. 'security_type'
# color_map - RColorBrewer palette name for the groups
#
# Side effects: saves each plot via save_plot() (defined elsewhere in this
# file) and prints every plot.
time_series_plot_group <- function(data, variables, var_names, label_name, periods, group, color_map) {
  
  # Convert the data to long format and drop rows where the group is missing
  data_long <- data %>%
    pivot_longer(cols = all_of(variables), 
                 names_to = "variable", 
                 values_to = "value") %>%
    mutate(variable = factor(variable, 
                             levels = variables, 
                             labels = var_names)) %>%
    filter(!is.na(!!sym(group)))
  
  # Define color palette for the group variable.
  # NOTE(review): brewer.pal() comes from RColorBrewer, which is not loaded
  # in the visible library() calls — confirm it is attached earlier; it also
  # warns/errors when fewer than 3 groups are requested.
  group_colors <- brewer.pal(n = length(unique(data_long[[group]])), 
                             name = color_map)
  names(group_colors) <- unique(data_long[[group]])
  
  # Create shaded areas for showing comparison periods
  shaded_areas <- do.call(rbind, 
                          lapply(periods, function(p) {
                            data.frame(xmin = as.Date(p$start),
                                       xmax = as.Date(p$end), 
                                       ymin = -Inf, 
                                       ymax = Inf, 
                                       fill = p$color, 
                                       label = p$label)
                          })
  )
  
  # Create plots for each variable
  create_plot <- function(var_name, dataset_name) {
    plot_data <- data_long %>% 
      filter(variable == var_name)
    
    # Initialise the plot
    p <- ggplot() +
      
      # Loop over each group and plot its points and lines in order.
      # NOTE(review): `size` is deprecated for lines in ggplot2 >= 3.4
      # (use `linewidth`) — confirm the ggplot2 version in use.
      lapply(unique(plot_data[[group]]), function(g) {
        list(
          geom_point(data = plot_data %>% filter(!!sym(group) == g),
                     aes(x = date, y = value, color = !!sym(group)), size = 2, alpha = 0.9),
          geom_line(data = plot_data %>% filter(!!sym(group) == g),
                    aes(x = date, y = value, color = !!sym(group)), size = 1)
        )
      }) +
      
      # Create shaded comparison periods.
      # NOTE(review): unlike time_series_plot_overall, the rectangles are
      # added AFTER the lines here, so the translucent shading is drawn on
      # top of the data — confirm this z-order is intended.
      geom_rect(data = shaded_areas, 
                aes(xmin = xmin, 
                    xmax = xmax, 
                    ymin = ymin, 
                    ymax = ymax, 
                    fill = label),
                alpha = 0.2, 
                inherit.aes = FALSE) +
      
      # Customise colors and legends
      scale_color_manual(values = group_colors, 
                         name = label_name) +
      scale_fill_manual(values = setNames(unique(shaded_areas$fill),
                                          unique(shaded_areas$label)),
                        name = "Period") +
      
      # Customise labels and titles
      labs(title = paste("Overall Performance of", var_name, "during 20 August 2019 to 20 August 2020"),
           subtitle = paste("( Group by", label_name, ")"),
           x = "Date", 
           y = paste("Standardised", var_name)) +
      
      # Customise the x-axis and y-axis
      scale_x_date(date_labels = "%Y-%m-%d", 
                   date_breaks = "1 month") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 55, 
                                       hjust = 1)) +
      scale_y_continuous(expand = expansion(mult = c(0.05, 0.1)))
    
    # Save the plot
    save_plot(p, paste0(dataset_name, "_", var_name))
    
    return(p)
  }
  
  # Capture the caller's variable name for readable saved-plot filenames
  dataset_name <- deparse(substitute(data))
  
  plots <- lapply(var_names, function(var_name) {
    create_plot(var_name, dataset_name)
  })
  
  # Print plots
  for (plot in plots) {
    print(plot)
  }
}

Call function to create plot(s)

# Define the variables and their corresponding names
# (scaled columns created in the task1_* panels; labels matched by position)
variables_scaled <- c('scaled_RET', 'scaled_volatility', 'scaled_sharpe_ratio', 'scaled_spread', 'scaled_dollar_vol', 'scaled_numtrd', 'scaled_turnover_ratio')
var_names <- c('Returns', 'Volatility', 'Sharpe Ratio', 'Bid-Ask Spread', 'Dollar Volume', 'Number of Trade', 'Turnover Ratio')

# Define the dates and colors for shaded areas
# (the same non-COVID / COVID comparison windows used in the summary tables)
periods_group <- list(
  list(start = "2019-10-14", end = "2019-11-20", color = "#937613", label = "Non-COVID Period\n(2019-10-14 to 2019-11-20)"),
  list(start = "2020-02-14", end = "2020-03-20", color = "black", label = "COVID Period\n(2020-02-14 to 2020-03-20)")
  )

# Overall Performance (task1_overall is built earlier in this file)
time_series_plot_overall(task1_overall, variables_scaled, var_names, periods_group)

# By security type
time_series_plot_group(task1_type, variables_scaled, var_names, "Security Types", periods_group, 'security_type', color_map = "Dark2")

# By industry sector
time_series_plot_group(task1_sector, variables_scaled, var_names,"Industry Sectors", periods_group, 'SECTOR', color_map = "Paired")

# By Company size
time_series_plot_group(task1_size, variables_scaled, var_names, "Company sizes", periods_group, 'company_size', color_map = "Set1")

Function to create time series plot (Market Performance and Risk Metrics)

# Combined time-series plot of the standardised market performance and risk
# metrics (returns, volatility, Sharpe ratio) with the comparison windows
# shaded.
#
# data      - data frame with `date` plus the columns in `variables`
# variables - column names to plot
# var_names - display labels, matched to `variables` by position
# periods   - list of lists with $start, $end, $color, $label (shaded windows)
#
# Returns the ggplot object.
market_risk <- function(data, variables, var_names, periods) {
  
  # Long format: one (date, variable, value) row, labelled for the legend
  long_metrics <- data %>%
    pivot_longer(
      cols = all_of(variables),
      names_to = "variable",
      values_to = "value"
    ) %>%
    mutate(variable = factor(variable, levels = variables, labels = var_names))
  
  # One full-height rectangle per comparison window
  shading <- bind_rows(
    lapply(periods, function(p) {
      data.frame(
        xmin = as.Date(p$start),
        xmax = as.Date(p$end),
        ymin = -Inf,
        ymax = Inf,
        fill = p$color,
        label = p$label
      )
    })
  )
  
  # Fixed colour per metric; every series is drawn solid
  metric_colors <- c(
    "Returns" = "#f39906",
    "Volatility" = "#6402bb",
    "Sharpe Ratio" = "#32cd4c"
  )
  metric_linetypes <- c(
    "Returns" = "solid",
    "Volatility" = "solid",
    "Sharpe Ratio" = "solid"
  )
  
  # Lines first, then the translucent shading on top (matches original order)
  ggplot() +
    geom_line(
      data = long_metrics,
      aes(x = date, y = value, color = variable, linetype = variable)
    ) +
    geom_rect(
      data = shading,
      aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = label),
      alpha = 0.2,
      inherit.aes = FALSE
    ) +
    scale_color_manual(name = "Metrics", values = metric_colors) +
    scale_linetype_manual(name = "Metrics", values = metric_linetypes) +
    scale_fill_manual(
      values = setNames(unique(shading$fill), unique(shading$label)),
      name = "Period"
    ) +
    labs(
      title = "Market Performance and Risk Metrics: August 20, 2019 to August 20, 2020",
      x = "Date",
      y = "Standardised Values of Performance Metrics"
    ) +
    scale_x_date(date_labels = "%Y-%m-%d", date_breaks = "1 month") +
    scale_y_continuous(limits = c(-5.5, 5.5), breaks = seq(-5.0, 5.0, by = 2)) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 55, hjust = 1))
}

# Define variables: the performance/risk subset of the scaled metrics
variables_risk <- c('scaled_RET', 'scaled_volatility', 'scaled_sharpe_ratio')
var_names_risk <- c('Returns', 'Volatility', 'Sharpe Ratio')

# Call function to create plot (periods_group defined in the plotting setup)
performance_risk <- market_risk(task1_overall, variables_risk, var_names_risk, periods_group)

print(performance_risk)

# Save the graph (save_plot is defined elsewhere in this file)
save_plot(performance_risk, "Time Series Plot for Market Performance and Risk Metrics")

Function to create time series plot (Market Liquidity and Trading Behavior Metrics)

# Combined time-series plot of the standardised market liquidity and trading
# behaviour metrics, with volatility overlaid as a dashed reference series and
# the comparison windows shaded.
#
# data      - data frame with `date` plus the columns in `variables`
# variables - column names to plot
# var_names - display labels, matched to `variables` by position
# periods   - list of lists with $start, $end, $color, $label (shaded windows)
#
# Returns the ggplot object.
market_liquidity <- function(data, variables, var_names, periods) {
  
  # Long format: one (date, variable, value) row, labelled for the legend
  long_metrics <- data %>%
    pivot_longer(
      cols = all_of(variables),
      names_to = "variable",
      values_to = "value"
    ) %>%
    mutate(variable = factor(variable, levels = variables, labels = var_names))
  
  # One full-height rectangle per comparison window
  shading <- bind_rows(
    lapply(periods, function(p) {
      data.frame(
        xmin = as.Date(p$start),
        xmax = as.Date(p$end),
        ymin = -Inf,
        ymax = Inf,
        fill = p$color,
        label = p$label
      )
    })
  )
  
  # Fixed colour per metric
  metric_colors <- c(
    "Bid-Ask Spread" = "#f39906",
    "Dollar Volume" = "#6402bb",
    "Number of Trade" = "#e85697",
    "Turnover Ratio" = "#32cd4c",
    "Volatility" = "black"
  )
  
  # Solid lines for the liquidity metrics; volatility dashed so it reads
  # as a reference series
  metric_linetypes <- c(
    "Bid-Ask Spread" = "solid",
    "Dollar Volume" = "solid",
    "Turnover Ratio" = "solid",
    "Number of Trade" = "solid",
    "Volatility" = "dashed"
  )
  
  # Lines first, then the translucent shading on top (matches original order)
  ggplot() +
    geom_line(
      data = long_metrics,
      aes(x = date, y = value, color = variable, linetype = variable)
    ) +
    geom_rect(
      data = shading,
      aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = label),
      alpha = 0.2,
      inherit.aes = FALSE
    ) +
    scale_color_manual(name = "Metrics", values = metric_colors) +
    scale_linetype_manual(name = "Metrics", values = metric_linetypes) +
    scale_fill_manual(
      values = setNames(unique(shading$fill), unique(shading$label)),
      name = "Period"
    ) +
    labs(
      title = "Market Liquidity and Trading Behavior Metrics: August 20, 2019 to August 20, 2020",
      x = "Date",
      y = "Standardised Values of Performance Metrics"
    ) +
    scale_x_date(date_labels = "%Y-%m-%d", date_breaks = "1 month") +
    scale_y_continuous(limits = c(-3, 6), breaks = seq(-3, 6, by = 2)) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 55, hjust = 1))
}

# Define variables: the liquidity/trading subset, with volatility included
# as a reference series
variables_liquidity <- c('scaled_volatility', 'scaled_spread', 'scaled_dollar_vol', 'scaled_numtrd', 'scaled_turnover_ratio')

var_names_liquidity <- c('Volatility', 'Bid-Ask Spread', 'Dollar Volume', 'Number of Trade', 'Turnover Ratio')

# Call function to create plot (periods_group defined in the plotting setup)
liquidity_trading <- market_liquidity(task1_overall, variables_liquidity, var_names_liquidity, periods_group)
print(liquidity_trading)

# Save the graph (save_plot is defined elsewhere in this file)
save_plot(liquidity_trading, "Time Series Plot for Market Liquidity and Trading Behavior Metrics")

2 OLS Regressions

Data Preparation

# load library
library(broom)
library(caret)
library(reshape2)
library(car)

Calculate Fama-French Factor for Non-COVID period

task2 <- dataset1_df9

# Prepare dataset: pre-COVID daily returns joined to the daily Fama-French
# factors (Mkt.RF, SMB, HML) on date
noncovid_filtered <- task2 %>% 
  filter(date < "2019-12-14") %>% 
  select(PERMNO, date, RET) %>% 
  left_join(fama_french_factors, by = "date")

# Initialise an empty list to store beta values, keyed by PERMNO
beta_list <- list()

# Loop through each unique PERMNO and estimate its factor loadings
for (permno in unique(noncovid_filtered$PERMNO)) {
  # Filter the data for the current PERMNO
  permno_data <- subset(noncovid_filtered, PERMNO == permno)
  
  # Time-series regression of the security's returns on the three factors
  model_beta <- lm(RET ~ Mkt.RF + SMB + HML, data = permno_data)
  
  # Store the betas under the PERMNO as a *character* key. A numeric index
  # here would assign by position (NULL-padding the list up to the PERMNO
  # value) and leave names(beta_list) empty, breaking the conversion below.
  beta_list[[as.character(permno)]] <- coef(model_beta)
}

# Convert the named list of coefficient vectors to a data frame,
# one row per PERMNO, with the PERMNO kept as the row name for now
FFF_beta_noncovid <- do.call(rbind, lapply(names(beta_list), function(x) {
  as.data.frame(t(beta_list[[x]]), row.names = x)
}))

# Reset row names to a column
FFF_beta_noncovid <- tibble::rownames_to_column(FFF_beta_noncovid, var = "PERMNO")

# Rename columns to reflect beta coefficients, and drop the intercept
# since it is not used downstream
FFF_beta_noncovid <- FFF_beta_noncovid %>%
  rename(
    Mkt_beta = Mkt.RF,
    SMB_beta = SMB,
    HML_beta = HML
  ) %>% 
  select(-"(Intercept)")

# Ensure the join key is character to match the datasets it is joined to
FFF_beta_noncovid$PERMNO <- as.character(FFF_beta_noncovid$PERMNO)

print(head(FFF_beta_noncovid, n = 5))
##   PERMNO     Mkt_beta      SMB_beta      HML_beta
## 1  10026 0.0036246336  0.0018566596 -0.0024391910
## 2  10028 0.0187801943  0.0034850211  0.0017528928
## 3  10032 0.0151359631  0.0055842491 -0.0020193657
## 4  10044 0.0002896842 -0.0006347077 -0.0019975343
## 5  10051 0.0065680513  0.0135855432  0.0009297792

Calculate Fama-French Factor for COVID period

# Prepare dataset: returns before the COVID window joined to the daily
# Fama-French factors (Mkt.RF, SMB, HML) on date
covid_filtered <- task2 %>% 
  filter(date < "2020-02-14") %>% 
  select(PERMNO, date, RET) %>% 
  left_join(fama_french_factors, by = "date")

# Initialise an empty list to store beta values, keyed by PERMNO
beta_list <- list()

# Loop through each unique PERMNO and estimate its factor loadings
for (permno in unique(covid_filtered$PERMNO)) {
  # Filter the data for the current PERMNO
  permno_data <- subset(covid_filtered, PERMNO == permno)
  
  # Time-series regression of the security's returns on the three factors
  model_beta <- lm(RET ~ Mkt.RF + SMB + HML, data = permno_data)
  
  # Store the betas under the PERMNO as a *character* key. A numeric index
  # here would assign by position (NULL-padding the list up to the PERMNO
  # value) and leave names(beta_list) empty, breaking the conversion below.
  beta_list[[as.character(permno)]] <- coef(model_beta)
}

# Convert the named list of coefficient vectors to a data frame,
# one row per PERMNO, with the PERMNO kept as the row name for now
FFF_beta_covid <- do.call(rbind, lapply(names(beta_list), function(x) {
  as.data.frame(t(beta_list[[x]]), row.names = x)
}))

# Reset row names to a column
FFF_beta_covid <- tibble::rownames_to_column(FFF_beta_covid, var = "PERMNO")

# Rename columns to reflect beta coefficients, and drop the intercept
# since it is not used downstream
FFF_beta_covid <- FFF_beta_covid %>%
  rename(
    Mkt_beta = Mkt.RF,
    SMB_beta = SMB,
    HML_beta = HML
  ) %>% 
  select(-"(Intercept)")

# Ensure the join key is character to match the datasets it is joined to
FFF_beta_covid$PERMNO <- as.character(FFF_beta_covid$PERMNO)

print(head(FFF_beta_covid, n = 5))
##   PERMNO     Mkt_beta     SMB_beta      HML_beta
## 1  10026 0.0016996314  0.001972067 -0.0005208422
## 2  10028 0.0142608706 -0.002256404  0.0002025232
## 3  10032 0.0141317851  0.005488963  0.0004350972
## 4  10044 0.0003835403  0.001919091 -0.0025091552
## 5  10051 0.0039712741  0.012994810  0.0029740526

Transform cross-sectional dataset function

# Build the cross-sectional dataset for the Task-2 regressions.
#
# data              - security-day panel (dataset1_df9-style)
# start_date_RET /
# end_date_RET      - window used to locate the modal highest/lowest price dates
#                     and compute the cross-sectional return
# start_date_others /
# end_date_others   - window over which the explanatory variables are aggregated
# type_of_security  - "both", "ETFs" (SHRCD 73) or "stocks" (SHRCD 11)
# FFF_dataset       - per-PERMNO Fama-French betas to join on PERMNO
#
# Returns one row per PERMNO with the computed return, aggregated /
# log-transformed / scaled explanatory variables, and the factor betas.
transform_dataset_for_task2 <- function(data, start_date_RET, end_date_RET, start_date_others, end_date_others, type_of_security, FFF_dataset) {
  # Small constant so log() never sees an exact zero
  epsilon <- 1e-6
  
  # Find the date with the most PERMNOs having the highest price.
  # NOTE(review): filter(PRC == max(PRC)) silently drops any PERMNO whose PRC
  # contains NA in the window (max() returns NA) — confirm that is acceptable.
  highest_change <- data %>%
    filter(date >= as.Date(start_date_RET) & date <= as.Date(end_date_RET)) %>%
    group_by(PERMNO) %>%
    filter(PRC == max(PRC)) %>%
    ungroup() %>%
    group_by(date) %>%
    summarise(num_highest = n()) %>%
    arrange(desc(num_highest)) 
  
  # Modal "high" date across all securities
  highest_date <- highest_change$date[1]
  cat("Highest Date: ")
  print(highest_change$date[1])

  # Find the date with the most PERMNOs having the lowest price
  lowest_change <- data %>%
    filter(date >= as.Date(start_date_RET) & date <= as.Date(end_date_RET)) %>%
    group_by(PERMNO) %>%
    filter(PRC == min(PRC)) %>%
    ungroup() %>%
    group_by(date) %>%
    summarise(num_lowest = n()) %>%
    arrange(desc(num_lowest))
  
  # Modal "low" date across all securities
  lowest_date <- lowest_change$date[1]
  cat("\nLowest Date: ")
  print(lowest_change$date[1])
  
  # Calculate the return using the lowest_date's and highest_date's price.
  # NOTE(review): both dates are the market-wide modal dates, so for an
  # individual PERMNO these need not be its own high/low, and the sign of
  # `returns` does not account for which date comes first chronologically.
  # spread() is superseded; pivot_wider() is the modern equivalent.
  data_RET <- data %>%
    filter(date %in% c(highest_date, lowest_date)) %>%
    select(PERMNO, date, PRC) %>%
    spread(key = date, value = PRC, fill = NA) %>%
    rename(price_highest = !!as.character(highest_date), 
           price_lowest = !!as.character(lowest_date)) %>%
    mutate(returns = (price_highest - price_lowest) / price_lowest) %>%
    select(PERMNO, returns)
  
  cat("\nNumber of PERMNO in the dataset that calculated returns: ", length(unique(data_RET$PERMNO)), "\n")
  
  # Filter the data for two periods 
  data_filtered <- data %>%
    filter(date >= as.Date(start_date_others) & date <= as.Date(end_date_others)) %>%
    arrange(PERMNO, date)

  # Filter securities type and calculate aggregated variables.
  # Categorical identifiers take the first observed value per PERMNO;
  # flows are summed and ratios/levels are averaged over the window.
  data_others <- data_filtered %>%
    filter(
      type_of_security == "both" | 
      (type_of_security == "ETFs" & SHRCD == 73) |
      (type_of_security == "stocks" & SHRCD == 11)
    ) %>%
    group_by(PERMNO) %>%
    summarise(
      SHRCD = first(SHRCD),
      TICKER = first(TICKER),
      SECTOR = first(SECTOR),
      company_size = first(company_size),
      VOL = sum(VOL, na.rm = TRUE),
      dollar_vol = sum(dollar_vol, na.rm = TRUE),
      bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
      turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
      volatility = mean(volatility, na.rm = TRUE),
      sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
      market_share = sum(market_share, na.rm = TRUE),
      market_cap = sum(market_cap, na.rm = TRUE),
      abs_corr_sp = mean(abs_corr_sp, na.rm = TRUE),
      security_type = first(security_type),
      tracking_error = mean(tracking_error, na.rm = TRUE),
      # NOTE(review): epsilon is added to every daily observation BEFORE the
      # sum/mean, so sums gain n*epsilon rather than epsilon — confirm intended.
      log_vol = log(sum(VOL + epsilon, na.rm = TRUE)),
      log_dollar_vol= log(sum(dollar_vol + epsilon, na.rm = TRUE)),
      log_spread = log(mean(bid_ask_spread + epsilon, na.rm = TRUE)),
      log_turnover_ratio = log(mean(turnover_ratio + epsilon, na.rm = TRUE)),
      log_volatility = log(mean(volatility + epsilon, na.rm = TRUE)),
      log_market_share = log(sum(market_share + epsilon, na.rm = TRUE)),
      log_market_cap = log(sum(market_cap + epsilon, na.rm = TRUE)),
      log_tracking_error = log(mean(tracking_error + epsilon, na.rm = TRUE)),
      # Uses the VOL and market_cap aggregates computed just above
      # (summarise evaluates expressions sequentially)
      vol_cap_ratio = VOL / market_cap,
      log_vol_cap_ratio = log(vol_cap_ratio),
      .groups = "drop"
    ) %>%
    # Standardised versions pooled across the full cross-section
    mutate(
      scaled_vol = scale(VOL),
      scaled_dollar_vol = scale(dollar_vol),
      scaled_spread = scale(bid_ask_spread),
      scaled_turnover_ratio = scale(turnover_ratio),
      scaled_volatility = scale(volatility),
      scaled_sharpe_ratio = scale(sharpe_ratio),
      scaled_market_share = scale(market_share),
      scaled_market_cap = scale(market_cap),
      scaled_abs_corr_sp = scale(abs_corr_sp),
      scaled_tracking_error = scale(tracking_error)
    ) 
  
  # Check for duplicates (summarise should yield exactly one row per PERMNO)
  duplicates <- data_others %>%
    group_by(PERMNO) %>%
    filter(n() > 1) %>%
    summarise(count = n())
  
  # Debugging
  if(nrow(duplicates) > 0) {
    warning("There are duplicates: ", paste(duplicates$PERMNO, collapse = ", "))
  }
  
  cat("Number of PERMNO that are in the dataset that aggregate x variables: ", length(unique(data_others$PERMNO)), "\n")
  
  cat("Number of PERMNO that are in the both datasets: ", length(unique(data_RET$PERMNO[data_RET$PERMNO %in% data_others$PERMNO])), "\n")
  
  # Join the returns values with other variables and the factor betas.
  # NOTE(review): filter(returns != 0) also drops NA returns (NA condition
  # rows are removed by filter) — confirm that is intended.
  final_data <- data_RET %>%
    filter(PERMNO %in% data_others$PERMNO) %>%
    left_join(data_others, by = "PERMNO") %>%
    ungroup() %>%  
    left_join(FFF_dataset, by = "PERMNO") %>% 
    filter(returns != 0)
  
  cat("Number of PERMNO that are in the final dataset: ", length(unique(final_data$PERMNO)), "\n")
  
  return(final_data)
}

Function to remove outlier

# Remove outliers using the 1.5 * IQR rule.
#
# Args:
#   data: a data.frame (or tibble) containing the columns to screen.
#   variables: character vector of column names; a row is dropped if it is
#     an outlier in ANY of the listed columns.
#
# Returns: `data` restricted to rows that fall inside
#   [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for every listed variable.
#
# Fixes vs. the original:
#   - NA values in a screened column previously propagated NA into the row
#     index, so `data[keep_rows, ]` produced NA-filled phantom rows; such
#     rows are now dropped explicitly.
#   - `drop = FALSE` keeps the result a data.frame even when `data` has a
#     single column (base `[.data.frame` would otherwise collapse to a vector).
#   - Bounds are computed with base quantile() directly, removing the
#     unnecessary per-variable dplyr::summarise round-trip.
remove_outliers_all <- function(data, variables) {
  # Logical vector tracking which rows survive every filter
  keep_rows <- rep(TRUE, nrow(data))
  
  for (variable in variables) {
    values <- data[[variable]]
    
    # Outlier bounds from the interquartile range
    qs <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE)
    iqr <- qs[[2]] - qs[[1]]
    lower_bound <- qs[[1]] - 1.5 * iqr
    upper_bound <- qs[[2]] + 1.5 * iqr
    
    in_range <- values >= lower_bound & values <= upper_bound
    # NA comparisons must not leak into the subscript vector
    in_range[is.na(in_range)] <- FALSE
    
    keep_rows <- keep_rows & in_range
  }
  
  # Return the cleaned data
  data[keep_rows, , drop = FALSE]
}

Model 1: Returns: 2019-12-14 to 2020-01-20 & X variables: before 2019-12-14

# Period 1 - Dataset 1
# Build the Model 1 dataset: returns measured over 2019-12-14..2020-01-20,
# explanatory (x) variables aggregated over 2019-08-20..2019-12-14.
# NOTE(review): argument semantics inferred from surrounding headings and the
# printed output — "both" appears to select both the highest- and lowest-
# price-change dates, and FFF_beta_noncovid supplies pre-estimated Fama-French
# betas for the non-COVID window. Confirm against the function's definition.
task2_p1_df1_1 <- transform_dataset_for_task2(task2, "2019-12-14", "2020-01-20", "2019-08-20", "2019-12-14", "both", FFF_beta_noncovid) 
## Highest Date: [1] "2020-01-17"
## 
## Lowest Date: [1] "2019-12-16"
## 
## Number of PERMNO in the dataset that calculated returns:  5790 
## Number of PERMNO that are in the dataset that aggregate x variables:  5840 
## Number of PERMNO that are in the both datasets:  5746 
## Number of PERMNO that are in the final dataset:  5682
# Drop outlying returns before modelling
task2_p1_df1_1 <- remove_outliers_all(task2_p1_df1_1, "returns")

# Visual check of the returns distribution after outlier removal
returns_hist_p1 <- ggplot(task2_p1_df1_1, aes(x = returns)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
print(returns_hist_p1)

cat("Sample size after remove outlier: ", length(unique(task2_p1_df1_1$PERMNO)))
## Sample size after remove outlier:  4919
# Create dummy variable for categorical variable.
# Rows with missing volatility are excluded, the three categorical columns are
# re-coded as factors, and one 0/1 indicator column per level is appended
# (the `- 1` in each formula suppresses the intercept so every level gets
# its own column).
task2_reg_p1_df1 <- task2_p1_df1_1 %>%
  filter(!is.na(volatility)) %>% 
  # Factor levels come from the pre-filter dataset, keeping level order stable
  # even if filtering removes every row of some level
  mutate(company_size = factor(company_size, levels = unique(task2_p1_df1_1$company_size)),
         security_type = factor(security_type, levels = unique(task2_p1_df1_1$security_type)),
         SECTOR = factor(SECTOR, levels = unique(task2_p1_df1_1$SECTOR))) %>%
  # NOTE(review): inside bind_cols() the magrittr dot is the piped (filtered,
  # mutated) data at that stage, so the dummy columns line up row-for-row
  bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
  bind_cols(as_tibble(model.matrix(~ security_type - 1, data = .))) %>% 
  bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) 
## tibble [4,881 × 57] (S3: tbl_df/tbl/data.frame)
##  $ PERMNO                      : chr [1:4881] "10026" "10028" "10032" "10044" ...
##  $ returns                     : num [1:4881] 0.01925 0.1259 0.01087 -0.00613 -0.00417 ...
##  $ SHRCD                       : int [1:4881] 11 11 11 11 11 11 11 73 11 11 ...
##  $ TICKER                      : chr [1:4881] "JJSF" "DGSE" "PLXS" "RMCF" ...
##  $ SECTOR                      : Factor w/ 12 levels "Consumer_Staples",..: 1 2 3 1 2 3 3 2 4 5 ...
##  $ company_size                : Factor w/ 3 levels "Medium","Small",..: 1 2 2 2 2 3 3 2 3 3 ...
##  $ VOL                         : num [1:4881] 7586162 4650390 13170524 658814 32686198 ...
##  $ dollar_vol                  : num [1:4881] 1.44e+09 6.27e+06 9.09e+08 5.93e+06 7.24e+08 ...
##  $ bid_ask_spread              : num [1:4881] 0.00109 0.015923 0.000649 0.011499 0.000773 ...
##  $ turnover_ratio              : num [1:4881] 0.0049 0.00211 0.00551 0.00134 0.01069 ...
##  $ volatility                  : num [1:4881] 0.176 0.658 0.28 0.153 0.364 ...
##  $ sharpe_ratio                : num [1:4881] -143.4 -38.2 -91 -161.8 -68.9 ...
##  $ market_share                : num [1:4881] 0.0286 0 0.0252 0.2334 0.0562 ...
##  $ market_cap                  : num [1:4881] 2.94e+11 2.76e+09 1.62e+11 4.45e+09 6.75e+10 ...
##  $ abs_corr_sp                 : num [1:4881] 0.72 0.318 0.635 0.193 0.595 ...
##  $ security_type               : Factor w/ 2 levels "Stock","ETF": 1 1 1 1 1 1 1 2 1 1 ...
##  $ tracking_error              : num [1:4881] 0.0225 0.0523 0.0261 0.0403 0.0324 ...
##  $ log_vol                     : num [1:4881] 15.8 15.4 16.4 13.4 17.3 ...
##  $ log_dollar_vol              : num [1:4881] 21.1 15.7 20.6 15.6 20.4 ...
##  $ log_spread                  : num [1:4881] -6.82 -4.14 -7.34 -4.47 -7.16 ...
##  $ log_turnover_ratio          : num [1:4881] -5.32 -6.16 -5.2 -6.61 -4.54 ...
##  $ log_volatility              : num [1:4881] -1.736 -0.418 -1.274 -1.878 -1.011 ...
##  $ log_market_share            : num [1:4881] -3.56 -13.82 -3.68 -1.46 -2.88 ...
##  $ log_market_cap              : num [1:4881] 26.4 21.7 25.8 22.2 24.9 ...
##  $ log_tracking_error          : num [1:4881] -3.79 -2.95 -3.65 -3.21 -3.43 ...
##  $ vol_cap_ratio               : num [1:4881] 2.58e-05 1.69e-03 8.15e-05 1.48e-04 4.84e-04 ...
##  $ log_vol_cap_ratio           : num [1:4881] -10.56 -6.38 -9.41 -8.82 -7.63 ...
##  $ scaled_vol                  : num [1:4881, 1] -0.254 -0.265 -0.231 -0.281 -0.153 ...
##   ..- attr(*, "scaled:center")= num 7.1e+07
##   ..- attr(*, "scaled:scale")= num 2.5e+08
##  $ scaled_dollar_vol           : num [1:4881, 1] -0.0912 -0.1478 -0.1122 -0.1478 -0.1195 ...
##   ..- attr(*, "scaled:center")= num 3.75e+09
##   ..- attr(*, "scaled:scale")= num 2.53e+10
##  $ scaled_spread               : num [1:4881, 1] -0.373 0.919 -0.412 0.534 -0.401 ...
##   ..- attr(*, "scaled:center")= num 0.00537
##   ..- attr(*, "scaled:scale")= num 0.0115
##  $ scaled_turnover_ratio       : num [1:4881, 1] -0.0436 -0.0523 -0.0417 -0.0546 -0.0256 ...
##   ..- attr(*, "scaled:center")= num 0.0189
##   ..- attr(*, "scaled:scale")= num 0.321
##  $ scaled_volatility           : num [1:4881, 1] -0.451 0.825 -0.177 -0.513 0.045 ...
##   ..- attr(*, "scaled:center")= num 0.347
##   ..- attr(*, "scaled:scale")= num 0.378
##  $ scaled_sharpe_ratio         : num [1:4881, 1] 0.125 0.281 0.203 0.098 0.236 ...
##   ..- attr(*, "scaled:center")= num -228
##   ..- attr(*, "scaled:scale")= num 675
##  $ scaled_market_share         : num [1:4881, 1] -0.147 -0.152 -0.148 -0.117 -0.143 ...
##   ..- attr(*, "scaled:center")= num 1.03
##   ..- attr(*, "scaled:scale")= num 6.79
##  $ scaled_market_cap           : num [1:4881, 1] -0.0668 -0.1749 -0.1159 -0.1743 -0.1509 ...
##   ..- attr(*, "scaled:center")= num 4.73e+11
##   ..- attr(*, "scaled:scale")= num 2.69e+12
##  $ scaled_abs_corr_sp          : num [1:4881, 1] 0.5055 -0.9971 0.1873 -1.4662 0.0368 ...
##   ..- attr(*, "scaled:center")= num 0.585
##   ..- attr(*, "scaled:scale")= num 0.267
##  $ scaled_tracking_error       : num [1:4881, 1] -0.3746 0.5239 -0.2674 0.1616 -0.0769 ...
##   ..- attr(*, "scaled:center")= num 0.0349
##   ..- attr(*, "scaled:scale")= num 0.0332
##  $ Mkt_beta                    : num [1:4881] 0.00362 0.01878 0.01514 0.00029 0.00657 ...
##  $ SMB_beta                    : num [1:4881] 0.001857 0.003485 0.005584 -0.000635 0.013586 ...
##  $ HML_beta                    : num [1:4881] -0.00244 0.00175 -0.00202 -0.002 0.00093 ...
##  $ company_sizeMedium          : num [1:4881] 1 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeSmall           : num [1:4881] 0 1 1 1 1 0 0 1 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeLarge           : num [1:4881] 0 0 0 0 0 1 1 0 1 1 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ security_typeStock          : num [1:4881] 1 1 1 1 1 1 1 0 1 1 ...
##   ..- attr(*, "assign")= int [1:2] 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ security_type: chr "contr.treatment"
##  $ security_typeETF            : num [1:4881] 0 0 0 0 0 0 0 1 0 0 ...
##   ..- attr(*, "assign")= int [1:2] 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ security_type: chr "contr.treatment"
##  $ SECTORConsumer_Staples      : num [1:4881] 1 0 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORUnknown               : num [1:4881] 0 1 0 0 1 0 0 1 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORInformation_Technology: num [1:4881] 0 0 1 0 0 1 1 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORFinancials            : num [1:4881] 0 0 0 0 0 0 0 0 1 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORIndustrials           : num [1:4881] 0 0 0 0 0 0 0 0 0 1 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORHealth_Care           : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORConsumer_Discretionary: num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORMaterials             : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORReal_Estate           : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORUtilities             : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTOREnergy                : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORCommunication_Services: num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 57
##    PERMNO  returns SHRCD TICKER SECTOR            company_size    VOL dollar_vol
##    <chr>     <dbl> <int> <chr>  <fct>             <fct>         <dbl>      <dbl>
##  1 10026   0.0192     11 JJSF   Consumer_Staples  Medium       7.59e6    1.44e 9
##  2 10028   0.126      11 DGSE   Unknown           Small        4.65e6    6.27e 6
##  3 10032   0.0109     11 PLXS   Information_Tech… Small        1.32e7    9.09e 8
##  4 10044  -0.00613    11 RMCF   Consumer_Staples  Small        6.59e5    5.93e 6
##  5 10051  -0.00417    11 HNGR   Unknown           Small        3.27e7    7.24e 8
##  6 10104   0.0211     11 ORCL   Information_Tech… Large        9.55e8    5.21e10
##  7 10107   0.0744     11 MSFT   Information_Tech… Large        1.84e9    2.61e11
##  8 10113   0.0534     73 AADR   Unknown           Small        6.25e5    3.09e 7
##  9 10138   0.0732     11 TROW   Financials        Large        7.70e7    8.87e 9
## 10 10145   0.0387     11 HON    Industrials       Large        2.26e8    3.85e10
## # ℹ 49 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## #   abs_corr_sp <dbl>, security_type <fct>, tracking_error <dbl>,
## #   log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## #   log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## #   log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## #   log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …

Make sure returns are accurately calculated

## The date that most PERMNOs has highest price change: 2020-01-17
## The date that most PERMNOs has lowest price change: 2019-12-16
## Number of PERMNOs in the dataset: 5929
## Number of PERMNOs on the date that most PERMNOs has highest price change: 5746
## Number of PERMNOs on the date that most PERMNOs has lowest price change: 5747
## Number of PERMNO that should be include in the dataset but did not:  0
## That PERMNO is:

Correlation of variables

# Correlation between returns and each potential explanatory variable
# (identifier and categorical columns are excluded first)
numeric_vars_p1_m1 <- task2_reg_p1_df1 %>%
  select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type))
cor_p1_m1 <- cor(numeric_vars_p1_m1, use = "pairwise.complete.obs")

# |correlation| > 0.7 flags a high risk of multicollinearity
high_corr_p1_m1 <- which(abs(cor_p1_m1) > 0.7, arr.ind = TRUE)
high_corr_pairs_p1_m1 <- data.frame(
  Feature1 = rownames(cor_p1_m1)[high_corr_p1_m1[, 1]],
  Feature2 = colnames(cor_p1_m1)[high_corr_p1_m1[, 2]],
  Correlation = cor_p1_m1[high_corr_p1_m1]
) %>%
  filter(Correlation < 1)  # drop the diagonal (each variable with itself)

# Inspect the variables with multicollinearity
head(high_corr_pairs_p1_m1)
##             Feature1       Feature2 Correlation
## 1     log_volatility     volatility   0.7562346
## 2 log_tracking_error tracking_error   0.8091045
## 3     log_dollar_vol        log_vol   0.9174205
## 4     log_market_cap        log_vol   0.8126475
## 5            log_vol log_dollar_vol   0.9174205
## 6         log_spread log_dollar_vol  -0.7862752
# Inspect all correlations in long (Var1, Var2, value) form
# NOTE(review): melt() is not from any package attached in the setup chunk
# shown here — presumably reshape2 (or data.table) is loaded earlier; verify
head(melt(cor_p1_m1))
##             Var1    Var2        value
## 1        returns returns  1.000000000
## 2            VOL returns  0.035337402
## 3     dollar_vol returns  0.050754431
## 4 bid_ask_spread returns -0.056033284
## 5 turnover_ratio returns  0.004433236
## 6     volatility returns  0.060627680

Build Model 1

# Splitting the dataset into training and testing sets
set.seed(696)  

# Non-COVID Period
# Predictor matrix. The omitted dummy columns act as baselines (company_size
# Medium, SECTOR Consumer_Staples); security-type dummies are not used here.
X_p1_m1 <- task2_reg_p1_df1 %>% 
  select(Mkt_beta, SMB_beta, HML_beta, log_turnover_ratio, scaled_spread, log_volatility, log_market_share, vol_cap_ratio, company_sizeSmall, company_sizeLarge, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown)

y_p1_m1 <- task2_reg_p1_df1$returns

# 80/20 split; caret::createDataPartition samples within groups of y so the
# outcome distribution is similar in both partitions
splitIndex_p1_m1 <- createDataPartition(y_p1_m1, 
                                        p = 0.8, 
                                        list = FALSE)
X_train_data_p1_m1 <- X_p1_m1[splitIndex_p1_m1, ]
X_test_data_p1_m1 <- X_p1_m1[-splitIndex_p1_m1, ]
y_train_data_p1_m1 <- y_p1_m1[splitIndex_p1_m1]
y_test_data_p1_m1 <- y_p1_m1[-splitIndex_p1_m1]

# Recombine predictors and response for lm()'s formula interface
train_data_p1_m1 <- cbind(X_train_data_p1_m1, returns = y_train_data_p1_m1)

# Build Model: OLS regression of returns on all selected predictors
model_1_p1 <- lm(returns ~ ., data = train_data_p1_m1)

# Prediction for training sample (in-sample fit)
pred_train_p1_m1 <- predict(model_1_p1, newdata = X_train_data_p1_m1)

# caret::postResample returns RMSE, Rsquared and MAE; MSE is derived as RMSE^2
metrics_train_p1_m1 <- postResample(pred = pred_train_p1_m1, 
                                    obs = train_data_p1_m1$returns)
metrics_table_train_p1_m1 <- data.frame(
  Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
  Value = c(metrics_train_p1_m1["RMSE"]^2,
            metrics_train_p1_m1["RMSE"], 
            metrics_train_p1_m1["MAE"], 
            metrics_train_p1_m1["Rsquared"])
)

# Prediction for testing sample (out-of-sample performance)
pred_test_p1_m1 <- predict(model_1_p1, newdata = X_test_data_p1_m1)
metrics_test_p1_m1 <- postResample(pred = pred_test_p1_m1, 
                                   obs = y_test_data_p1_m1)
metrics_table_test_p1_m1 <- data.frame(
  Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
  Value = c(metrics_test_p1_m1["RMSE"]^2
, metrics_test_p1_m1["RMSE"], metrics_test_p1_m1["MAE"], metrics_test_p1_m1["Rsquared"])
)
## Performance Metrics for the Training Dataset:
##      Metric       Value
## 1       MSE 0.002234472
## 2      RMSE 0.047270202
## 3       MAE 0.034317481
## 4 R-squared 0.105304918
## Performance Metrics for the Testing Dataset:
##      Metric       Value
## 1       MSE 0.001913322
## 2      RMSE 0.043741542
## 3       MAE 0.031795969
## 4 R-squared 0.173938416
## Regression coefficient of Model 1:
## 
## Call:
## lm(formula = returns ~ ., data = train_data_p1_m1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.190295 -0.022907 -0.000372  0.023547  0.186048 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.0323218  0.0065479   4.936 8.30e-07 ***
## Mkt_beta                      0.9574882  0.1190404   8.043 1.15e-15 ***
## SMB_beta                     -0.0984585  0.1025318  -0.960  0.33698    
## HML_beta                     -1.1786067  0.1157057 -10.186  < 2e-16 ***
## log_turnover_ratio            0.0016895  0.0007948   2.126  0.03360 *  
## scaled_spread                -0.0005114  0.0010723  -0.477  0.63345    
## log_volatility                0.0048233  0.0012160   3.967 7.42e-05 ***
## log_market_share              0.0006303  0.0003347   1.883  0.05974 .  
## vol_cap_ratio                 0.0604965  0.0504603   1.199  0.23064    
## company_sizeSmall            -0.0054483  0.0022215  -2.453  0.01423 *  
## company_sizeLarge             0.0028200  0.0031286   0.901  0.36745    
## SECTORMaterials              -0.0083186  0.0074315  -1.119  0.26305    
## SECTORIndustrials             0.0083327  0.0056437   1.476  0.13990    
## SECTORFinancials             -0.0141718  0.0053945  -2.627  0.00865 ** 
## SECTORInformation_Technology  0.0162510  0.0057003   2.851  0.00438 ** 
## SECTORConsumer_Discretionary  0.0024951  0.0057585   0.433  0.66482    
## SECTORHealth_Care             0.0080007  0.0056505   1.416  0.15688    
## SECTOREnergy                  0.0124139  0.0080301   1.546  0.12221    
## SECTORUtilities               0.0112863  0.0080688   1.399  0.16197    
## SECTORReal_Estate            -0.0013079  0.0114142  -0.115  0.90878    
## SECTORCommunication_Services  0.0323416  0.0075503   4.284 1.88e-05 ***
## SECTORUnknown                 0.0103622  0.0050517   2.051  0.04031 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0474 on 3883 degrees of freedom
## Multiple R-squared:  0.1053, Adjusted R-squared:  0.1005 
## F-statistic: 21.76 on 21 and 3883 DF,  p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
##                     Mkt_beta                     SMB_beta 
##                     1.319676                     1.419217 
##                     HML_beta           log_turnover_ratio 
##                     1.291541                     1.474125 
##                scaled_spread               log_volatility 
##                     1.551150                     2.234558 
##             log_market_share                vol_cap_ratio 
##                     1.662150                     1.072415 
##            company_sizeSmall            company_sizeLarge 
##                     1.708948                     1.532598 
##              SECTORMaterials            SECTORIndustrials 
##                     1.689553                     3.415319 
##             SECTORFinancials SECTORInformation_Technology 
##                     4.525347                     3.307744 
## SECTORConsumer_Discretionary            SECTORHealth_Care 
##                     3.168057                     3.558749 
##                 SECTOREnergy              SECTORUtilities 
##                     1.639638                     1.542901 
##            SECTORReal_Estate SECTORCommunication_Services 
##                     1.210997                     1.646023 
##                SECTORUnknown 
##                    10.973571
## Despite SectorUnknown has a VIF over 5, considering it is actually the group of data with missing value in Sector, while some other industry categories are important to the model. After trial and error, it is better to keep the industry variable in the model.
# Scatterplot of Prediction vs Actual (top-level expression so knitr prints it)
plot_data_p1_m1 <- data.frame(Predicted = pred_test_p1_m1,
                              Actual = y_test_data_p1_m1)

ggplot(plot_data_p1_m1, aes(x = Predicted, y = Actual)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue",
              aes(linetype = "Regression Line")) +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Evaluating Model 1's Performance: Predicted vs. Actual Returns in the Non-COVID Period",
       x = "Predicted returns",
       y = "Actual returns")

Important variables

# Extract coefficients into a (Factors, Coefficient) table for plotting
coef_m1 <- coef(model_1_p1)

importance_m1 <- data.frame(
  Factors = names(coef_m1),
  Coefficient = coef_m1
)

# Drop the intercept row. Fix: the column is named "Factors" — the original
# `importance_m1$Factor` only worked via `$`'s partial name matching on data
# frames and would silently return NULL (keeping the intercept) if another
# "Factor*" column were ever added.
importance_m1 <- importance_m1[importance_m1$Factors != "(Intercept)", ]

# Bar chart of model coefficients, coloured by the sign of their impact
plot_m1_p1_factors <- ggplot(importance_m1,
                             aes(x = reorder(Factors, Coefficient),
                                 y = Coefficient,
                                 fill = Coefficient > 0)) +
  geom_col() +
  geom_text(aes(label = round(Coefficient, 3)),
            vjust = ifelse(importance_m1$Coefficient > 0, 0.5, 0.3),
            color = "black") +
  coord_flip() +
  scale_fill_manual(
    values = c("TRUE" = "steelblue", "FALSE" = "coral"),
    labels = c("TRUE" = "Positive Impact",
               "FALSE" = "Negative Impact"),
    name = ""
  ) +
  labs(title = "Impact of factors on Returns for Model 1",
       x = "Factors",
       y = "Coefficient") +
  theme_minimal() +
  theme(
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  )

print(plot_m1_p1_factors)

Group by Sector

# Apply the model particularly by industry
pred_sector_p1_m1 <- cbind(X_test_data_p1_m1,
                           SECTOR = task2_reg_p1_df1$SECTOR[-splitIndex_p1_m1])

data_sector_p1_m1 <- data.frame(
  Predicted = pred_test_p1_m1,
  Actual = y_test_data_p1_m1,
  SECTOR = as.character(pred_sector_p1_m1$SECTOR)
)

# Per-sector evaluation metrics for the test-set predictions; R-squared is
# computed directly as 1 - SSR/SST within each sector
metrics_table_sector_p1_m1 <- data_sector_p1_m1 %>%
  group_by(SECTOR) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    R_Squared = 1 - sum((Actual - Predicted)^2) / sum((Actual - mean(Actual))^2)
  )

print(metrics_table_sector_p1_m1)
## # A tibble: 12 × 5
##    SECTOR                     MSE   RMSE    MAE R_Squared
##    <chr>                    <dbl>  <dbl>  <dbl>     <dbl>
##  1 Communication_Services 0.00436 0.0661 0.0553   0.0987 
##  2 Consumer_Discretionary 0.00300 0.0548 0.0441   0.0622 
##  3 Consumer_Staples       0.00273 0.0523 0.0410  -0.379  
##  4 Energy                 0.00422 0.0650 0.0533  -0.679  
##  5 Financials             0.00146 0.0382 0.0318   0.0991 
##  6 Health_Care            0.00360 0.0600 0.0476   0.0543 
##  7 Industrials            0.00262 0.0512 0.0385  -0.00432
##  8 Information_Technology 0.00269 0.0519 0.0420   0.120  
##  9 Materials              0.00302 0.0550 0.0447   0.0163 
## 10 Real_Estate            0.00418 0.0647 0.0575   0.297  
## 11 Unknown                0.00128 0.0358 0.0245   0.208  
## 12 Utilities              0.00315 0.0561 0.0442  -0.225
# Plot the prediction vs actual by SECTOR
plot_m1_p1_sector <- ggplot(data_sector_p1_m1,
                            aes(x = Predicted, y = Actual, color = SECTOR)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "blue",
              aes(linetype = "Regression Line")) +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold")) +
  labs(title = "Evaluating Model 1's Performance: Predicted vs. Actual Returns",
       subtitle = "During the Non-COVID Period (Grouped by Industry Sector)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Industry Sector")


print(plot_m1_p1_sector)

Group by Security Type

# Apply the model particularly by security type
pred_type_p1_m1 <- cbind(X_test_data_p1_m1,
                         security_type = task2_reg_p1_df1$security_type[-splitIndex_p1_m1])

data_type_p1_m1 <- data.frame(
  Predicted = pred_test_p1_m1,
  Actual = y_test_data_p1_m1,
  security_type = pred_type_p1_m1$security_type
)

# Per-security-type evaluation metrics for the test-set predictions;
# R-squared is computed directly as 1 - SSR/SST within each type
metrics_table_type_p1_m1 <- data_type_p1_m1 %>%
  group_by(security_type) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    R_Squared = 1 - sum((Actual - Predicted)^2) / sum((Actual - mean(Actual))^2)
  )

print(metrics_table_type_p1_m1)
## # A tibble: 2 × 5
##   security_type      MSE   RMSE    MAE R_Squared
##   <fct>            <dbl>  <dbl>  <dbl>     <dbl>
## 1 Stock         0.00286  0.0534 0.0421     0.134
## 2 ETF           0.000712 0.0267 0.0187     0.299
# Plot the prediction vs actual by security type
plot_m1_p1_type <- ggplot(data = data_type_p1_m1, 
                     aes(x = Predicted, 
                         y = Actual,
                         color = security_type)) +
  geom_point(size = 2) +
  # NOTE(review): the inherited color = security_type aesthetic still groups
  # the data, so this likely fits one blue line per security type rather than
  # a single pooled regression line — confirm which was intended
  geom_smooth(method = "lm", 
              se = FALSE, 
              color = "blue", 
              aes(linetype = "Regression Line")) + 
  scale_linetype_manual(name = NULL, 
                        values = "solid", 
                        labels = "Regression Line") + 
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold")) +  
  labs(title = "Evaluating Model 1's Performance: Predicted vs. Actual Returns",
       subtitle = "During the Non-COVID Period (Grouped by Security Type)",  
       x = "Predicted returns", 
       y = "Actual returns",
       color = "Security Type")


print(plot_m1_p1_type)

Group by Company Size

# Apply the model particularly by company size
pred_size_p1_m1 <- cbind(X_test_data_p1_m1,
                         company_size = task2_reg_p1_df1$company_size[-splitIndex_p1_m1])

data_size_p1_m1 <- data.frame(
  Predicted = pred_test_p1_m1,
  Actual = y_test_data_p1_m1,
  company_size = pred_size_p1_m1$company_size
)

# Per-size evaluation metrics for the test-set predictions;
# R-squared is computed directly as 1 - SSR/SST within each size group
metrics_table_size_p1_m1 <- data_size_p1_m1 %>%
  group_by(company_size) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    R_Squared = 1 - sum((Actual - Predicted)^2) / sum((Actual - mean(Actual))^2)
  )

print(metrics_table_size_p1_m1)
## # A tibble: 3 × 5
##   company_size     MSE   RMSE    MAE R_Squared
##   <fct>          <dbl>  <dbl>  <dbl>     <dbl>
## 1 Medium       0.00171 0.0414 0.0317     0.167
## 2 Small        0.00197 0.0443 0.0317     0.138
## 3 Large        0.00185 0.0431 0.0328     0.129
# Plot the prediction vs actual by company size
plot_m1_p1_size <- ggplot(data = data_size_p1_m1, 
                     aes(x = Predicted, 
                         y = Actual,
                         color = company_size)) +
  geom_point(size = 2, alpha = 0.7) +
  # NOTE(review): the inherited color = company_size aesthetic still groups
  # the data, so this likely fits one blue line per size group rather than a
  # single pooled regression line — confirm which was intended
  geom_smooth(method = "lm", 
              se = FALSE, 
              color = "blue", 
              aes(linetype = "Regression Line")) + 
  scale_linetype_manual(name = NULL, 
                        values = "solid", 
                        labels = "Regression Line") + 
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold")) +  
  labs(title = "Evaluating Model 1's Performance: Predicted vs. Actual Returns",
       subtitle = "During the Non-COVID Period (Grouped by Company Size)",  
       x = "Predicted returns", 
       y = "Actual returns",
       color = "Company Size")


print(plot_m1_p1_size)

Model 2: Returns: 2020-02-14 to 2020-03-20 & X variables: before 2020-02-14

# Period 2 - Dataset 1 (original comment said "Period 1"; this is the COVID
# window: returns over 2020-02-14..2020-03-20, x variables aggregated over
# 2019-08-20..2020-02-14, using the COVID-period Fama-French betas)
task2_p2_df1_1 <- transform_dataset_for_task2(task2, "2020-02-14", "2020-03-20", "2019-08-20", "2020-02-14", "both", FFF_beta_covid)
## Highest Date: [1] "2020-02-14"
## 
## Lowest Date: [1] "2020-03-18"
## 
## Number of PERMNO in the dataset that calculated returns:  5764 
## Number of PERMNO that are in the dataset that aggregate x variables:  5924 
## Number of PERMNO that are in the both datasets:  5763 
## Number of PERMNO that are in the final dataset:  5708
# Drop outlying returns before modelling
task2_p2_df1_1 <- remove_outliers_all(task2_p2_df1_1, "returns")

# Visual check of the returns distribution after outlier removal
returns_hist_p2 <- ggplot(task2_p2_df1_1, aes(x = returns)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
print(returns_hist_p2)

cat("Sample size after remove outlier: ", length(unique(task2_p2_df1_1$PERMNO)))
## Sample size after remove outlier:  5235
# Create dummy variable for categorical variable.
# Same construction as for Model 1: exclude rows with missing volatility,
# fix factor levels from the pre-filter dataset for stable level order, then
# append one 0/1 indicator column per level via intercept-free model.matrix
# calls (the magrittr dot inside bind_cols() is the piped data at that stage,
# so the dummy columns line up row-for-row).
task2_reg_p2_df1 <- task2_p2_df1_1 %>%
  filter(!is.na(volatility)) %>% 
  mutate(company_size = factor(company_size, levels = unique(task2_p2_df1_1$company_size)),
         security_type = factor(security_type, levels = unique(task2_p2_df1_1$security_type)),
         SECTOR = factor(SECTOR, levels = unique(task2_p2_df1_1$SECTOR))) %>%
  bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
  bind_cols(as_tibble(model.matrix(~ security_type - 1, data = .))) %>% 
  bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) 
## tibble [5,190 × 57] (S3: tbl_df/tbl/data.frame)
##  $ PERMNO                      : chr [1:5190] "10026" "10028" "10032" "10044" ...
##  $ returns                     : num [1:5190] 0.5036 0.0106 0.9654 0.5507 1.0622 ...
##  $ SHRCD                       : int [1:5190] 11 11 11 11 11 11 11 73 11 11 ...
##  $ TICKER                      : chr [1:5190] "JJSF" "DGSE" "PLXS" "RMCF" ...
##  $ SECTOR                      : Factor w/ 12 levels "Consumer_Staples",..: 1 2 3 1 2 3 3 2 4 5 ...
##  $ company_size                : Factor w/ 3 levels "Medium","Small",..: 1 2 2 2 2 3 3 2 3 3 ...
##  $ VOL                         : num [1:5190] 11872557 9713299 20878422 1408090 42541489 ...
##  $ dollar_vol                  : num [1:5190] 2.20e+09 1.62e+07 1.50e+09 1.25e+07 9.85e+08 ...
##  $ bid_ask_spread              : num [1:5190] 0.001024 0.014651 0.000675 0.011492 0.000678 ...
##  $ turnover_ratio              : num [1:5190] 0.00507 0.00291 0.00577 0.00189 0.0092 ...
##  $ volatility                  : num [1:5190] 0.195 0.618 0.238 0.15 0.296 ...
##  $ sharpe_ratio                : num [1:5190] -137.3 -41.7 -109.6 -162.8 -88.8 ...
##  $ market_share                : num [1:5190] 0.0422 0.024 0.0359 0.3338 0.0776 ...
##  $ market_cap                  : num [1:5190] 4.36e+11 4.72e+09 2.55e+11 6.66e+09 1.09e+11 ...
##  $ abs_corr_sp                 : num [1:5190] 0.72 0.318 0.635 0.193 0.595 ...
##  $ security_type               : Factor w/ 2 levels "Stock","ETF": 1 1 1 1 1 1 1 2 1 1 ...
##  $ tracking_error              : num [1:5190] 0.0225 0.0523 0.0261 0.0403 0.0324 ...
##  $ log_vol                     : num [1:5190] 16.3 16.1 16.9 14.2 17.6 ...
##  $ log_dollar_vol              : num [1:5190] 21.5 16.6 21.1 16.3 20.7 ...
##  $ log_spread                  : num [1:5190] -6.88 -4.22 -7.3 -4.47 -7.29 ...
##  $ log_turnover_ratio          : num [1:5190] -5.28 -5.84 -5.15 -6.27 -4.69 ...
##  $ log_volatility              : num [1:5190] -1.637 -0.482 -1.434 -1.896 -1.218 ...
##  $ log_market_share            : num [1:5190] -3.16 -3.73 -3.33 -1.1 -2.56 ...
##  $ log_market_cap              : num [1:5190] 26.8 22.3 26.3 22.6 25.4 ...
##  $ log_tracking_error          : num [1:5190] -3.79 -2.95 -3.65 -3.21 -3.43 ...
##  $ vol_cap_ratio               : num [1:5190] 2.72e-05 2.06e-03 8.18e-05 2.11e-04 3.91e-04 ...
##  $ log_vol_cap_ratio           : num [1:5190] -10.51 -6.19 -9.41 -8.46 -7.85 ...
##  $ scaled_vol                  : num [1:5190, 1] -0.255 -0.26 -0.231 -0.282 -0.174 ...
##   ..- attr(*, "scaled:center")= num 1.09e+08
##   ..- attr(*, "scaled:scale")= num 3.8e+08
##  $ scaled_dollar_vol           : num [1:5190, 1] -0.0898 -0.1428 -0.1068 -0.1429 -0.1193 ...
##   ..- attr(*, "scaled:center")= num 5.9e+09
##   ..- attr(*, "scaled:scale")= num 4.12e+10
##  $ scaled_spread               : num [1:5190, 1] -0.38 0.871 -0.412 0.581 -0.411 ...
##   ..- attr(*, "scaled:center")= num 0.00516
##   ..- attr(*, "scaled:scale")= num 0.0109
##  $ scaled_turnover_ratio       : num [1:5190, 1] -0.0606 -0.0707 -0.0573 -0.0755 -0.0412 ...
##   ..- attr(*, "scaled:center")= num 0.018
##   ..- attr(*, "scaled:scale")= num 0.213
##  $ scaled_volatility           : num [1:5190, 1] -0.399 0.764 -0.279 -0.521 -0.121 ...
##   ..- attr(*, "scaled:center")= num 0.34
##   ..- attr(*, "scaled:scale")= num 0.364
##  $ scaled_sharpe_ratio         : num [1:5190, 1] 0.15 0.291 0.191 0.113 0.222 ...
##   ..- attr(*, "scaled:center")= num -240
##   ..- attr(*, "scaled:scale")= num 681
##  $ scaled_market_share         : num [1:5190, 1] -0.147 -0.149 -0.148 -0.117 -0.143 ...
##   ..- attr(*, "scaled:center")= num 1.45
##   ..- attr(*, "scaled:scale")= num 9.55
##  $ scaled_market_cap           : num [1:5190, 1] -0.0686 -0.1709 -0.1115 -0.1705 -0.1463 ...
##   ..- attr(*, "scaled:center")= num 7.25e+11
##   ..- attr(*, "scaled:scale")= num 4.22e+12
##  $ scaled_abs_corr_sp          : num [1:5190, 1] 0.5053 -0.9915 0.1883 -1.4588 0.0384 ...
##   ..- attr(*, "scaled:center")= num 0.584
##   ..- attr(*, "scaled:scale")= num 0.268
##  $ scaled_tracking_error       : num [1:5190, 1] -0.3752 0.5248 -0.2678 0.1619 -0.0769 ...
##   ..- attr(*, "scaled:center")= num 0.0349
##   ..- attr(*, "scaled:scale")= num 0.0331
##  $ Mkt_beta                    : num [1:5190] 0.0017 0.014261 0.014132 0.000384 0.003971 ...
##  $ SMB_beta                    : num [1:5190] 0.00197 -0.00226 0.00549 0.00192 0.01299 ...
##  $ HML_beta                    : num [1:5190] -0.000521 0.000203 0.000435 -0.002509 0.002974 ...
##  $ company_sizeMedium          : num [1:5190] 1 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeSmall           : num [1:5190] 0 1 1 1 1 0 0 1 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeLarge           : num [1:5190] 0 0 0 0 0 1 1 0 1 1 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ security_typeStock          : num [1:5190] 1 1 1 1 1 1 1 0 1 1 ...
##   ..- attr(*, "assign")= int [1:2] 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ security_type: chr "contr.treatment"
##  $ security_typeETF            : num [1:5190] 0 0 0 0 0 0 0 1 0 0 ...
##   ..- attr(*, "assign")= int [1:2] 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ security_type: chr "contr.treatment"
##  $ SECTORConsumer_Staples      : num [1:5190] 1 0 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORUnknown               : num [1:5190] 0 1 0 0 1 0 0 1 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORInformation_Technology: num [1:5190] 0 0 1 0 0 1 1 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORFinancials            : num [1:5190] 0 0 0 0 0 0 0 0 1 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORIndustrials           : num [1:5190] 0 0 0 0 0 0 0 0 0 1 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORHealth_Care           : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORConsumer_Discretionary: num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORMaterials             : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORReal_Estate           : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORUtilities             : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTOREnergy                : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORCommunication_Services: num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 57
##    PERMNO returns SHRCD TICKER SECTOR             company_size    VOL dollar_vol
##    <chr>    <dbl> <int> <chr>  <fct>              <fct>         <dbl>      <dbl>
##  1 10026   0.504     11 JJSF   Consumer_Staples   Medium       1.19e7    2.20e 9
##  2 10028   0.0106    11 DGSE   Unknown            Small        9.71e6    1.62e 7
##  3 10032   0.965     11 PLXS   Information_Techn… Small        2.09e7    1.50e 9
##  4 10044   0.551     11 RMCF   Consumer_Staples   Small        1.41e6    1.25e 7
##  5 10051   1.06      11 HNGR   Unknown            Small        4.25e7    9.85e 8
##  6 10104   0.173     11 ORCL   Information_Techn… Large        1.37e9    7.47e10
##  7 10107   0.320     11 MSFT   Information_Techn… Large        3.00e9    4.57e11
##  8 10113   0.641     73 AADR   Unknown            Small        1.38e6    7.21e 7
##  9 10138   0.267     11 TROW   Financials         Large        1.21e8    1.46e10
## 10 10145   0.512     11 HON    Industrials        Large        3.35e8    5.77e10
## # ℹ 49 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## #   abs_corr_sp <dbl>, security_type <fct>, tracking_error <dbl>,
## #   log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## #   log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## #   log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## #   log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …

Sanity check: verify that the COVID-period returns were calculated correctly

# Find the date with the most PERMNOs having the highest price change
# (within the COVID crash window 2020-02-14..2020-03-20, for each PERMNO
# keep its peak-price day(s), then count how many PERMNOs peak on each date)
highest_change_covid <- task2 %>%
  filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
  group_by(PERMNO) %>%
  # ties are all kept; NA in PRC would make max(PRC) NA — assumed absent here
  filter(PRC == max(PRC)) %>%
  ungroup() %>%
  group_by(date) %>%
  summarise(num_highest = n()) %>%
  arrange(desc(num_highest))

# Find the date with the most PERMNOs having the lowest price change
# (same construction as above, using each PERMNO's trough-price day(s))
lowest_change_covid <- task2 %>%
  filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
  group_by(PERMNO) %>%
  filter(PRC == min(PRC)) %>%
  ungroup() %>%
  group_by(date) %>%
  summarise(num_lowest = n()) %>%
  arrange(desc(num_lowest))

# Inspect the results
# Recompute a per-PERMNO return between the modal peak date and the modal
# trough date, as (peak price - trough price) / trough price.
# NOTE(review): the summarise below assumes each PERMNO has exactly one PRC
# observation on each of the two dates; a PERMNO present on only one of them
# yields a length-0 extraction and summarise() will error — confirm coverage.
returns_covid <- task2 %>%
  filter(date == highest_change_covid$date[1] | date == lowest_change_covid$date[1]) %>% 
  group_by(PERMNO) %>% 
  summarise(max_change = PRC[date == highest_change_covid$date[1]],
    min_change = PRC[date == lowest_change_covid$date[1]],
    returns_covid = (max_change - min_change) / min_change) %>% 
    select(PERMNO, returns_covid)

# Cross-check against the returns already stored in the regression dataset;
# any row surviving the filter is a PERMNO whose two return figures disagree
# (reported count in the rendered output is 0, i.e. the returns reconcile)
comparison_covid <- returns_covid  %>%
  left_join(task2_reg_p2_df1, by = "PERMNO") %>%
  mutate(difference_covid = returns_covid != returns) %>%
  filter(difference_covid) 
## The date that most PERMNOs has highest price change: 2020-02-14
## The date that most PERMNOs has lowest price change: 2020-03-18
## Number of PERMNOs in the dataset: 5929
## Number of PERMNOs on the date that most PERMNOs has highest price change: 5762
## Number of PERMNOs on the date that most PERMNOs has lowest price change: 5711
## Number of PERMNO that should be include in the dataset but did not:  0
## That PERMNO is:

Correlation of variables

# Find the correlation between returns and each potential explanatory
# variable; identifier and categorical columns are excluded first
cor_p2_m1 <- cor(
  task2_reg_p2_df1 %>%
    select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)),
  use = "pairwise.complete.obs"
)

# Flag any pair with |r| > 0.7 as a multicollinearity risk
high_corr_p2_m1 <- which(abs(cor_p2_m1) > 0.7, arr.ind = TRUE)
high_corr_pairs_p2_m1 <- data.frame(
  Feature1    = rownames(cor_p2_m1)[high_corr_p2_m1[, 1]],
  Feature2    = colnames(cor_p2_m1)[high_corr_p2_m1[, 2]],
  Correlation = cor_p2_m1[high_corr_p2_m1]
)
# Drop the trivial r == 1 entries (self-correlations and exact duplicates)
high_corr_pairs_p2_m1 <- high_corr_pairs_p2_m1 %>% filter(Correlation < 1)

# Inspect the flagged pairs
head(high_corr_pairs_p2_m1)
# Inspect all correlations in long (Var1, Var2, value) format
cor_p2_m1 %>% melt() %>% head()
##                Feature1       Feature2 Correlation
## 1            log_spread bid_ask_spread   0.7156169
## 2        tracking_error     volatility   0.7661033
## 3        log_volatility     volatility   0.7777328
## 4    log_tracking_error     volatility   0.7001564
## 5 scaled_tracking_error     volatility   0.7661033
## 6    log_tracking_error    abs_corr_sp  -0.7654664
# Inspect all correlations
head(melt(cor_p2_m1))
##             Var1    Var2       value
## 1        returns returns  1.00000000
## 2            VOL returns  0.03233583
## 3     dollar_vol returns -0.02082236
## 4 bid_ask_spread returns  0.01411746
## 5 turnover_ratio returns  0.01368088
## 6     volatility returns  0.24193335

Build Model 2

# Splitting the dataset into training and testing sets
set.seed(781)

# COVID Period: candidate predictors for Model 2
X_p2_m2 <- task2_reg_p2_df1 %>%
  select(
    Mkt_beta, SMB_beta, HML_beta, log_volatility, log_spread,
    scaled_market_share, scaled_abs_corr_sp, scaled_dollar_vol,
    SECTORMaterials, SECTORIndustrials, SECTORFinancials,
    SECTORInformation_Technology, SECTORConsumer_Discretionary,
    SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate,
    SECTORCommunication_Services, SECTORUnknown
  )

y_p2_m2 <- task2_reg_p2_df1$returns

# 80/20 stratified train-test split on the response
splitIndex_p2_m2 <- createDataPartition(y_p2_m2, p = 0.8, list = FALSE)
X_train_data_p2_m2 <- X_p2_m2[splitIndex_p2_m2, ]
X_test_data_p2_m2  <- X_p2_m2[-splitIndex_p2_m2, ]
y_train_data_p2_m2 <- y_p2_m2[splitIndex_p2_m2]
y_test_data_p2_m2  <- y_p2_m2[-splitIndex_p2_m2]

train_data_p2_m2 <- cbind(X_train_data_p2_m2, returns = y_train_data_p2_m2)

# Build Model: linear regression of returns on all selected predictors
model_2_p2 <- lm(returns ~ ., data = train_data_p2_m2)

# Prediction for training sample - Covid
pred_train_p2_m2 <- predict(model_2_p2, newdata = X_train_data_p2_m2)

# In-sample performance (MSE derived from caret's RMSE)
metrics_train_p2_m2 <- postResample(pred = pred_train_p2_m2,
                                    obs = train_data_p2_m2$returns)
metrics_table_train_p2_m2 <- data.frame(
  Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
  Value  = c(metrics_train_p2_m2[["RMSE"]]^2,
             metrics_train_p2_m2[["RMSE"]],
             metrics_train_p2_m2[["MAE"]],
             metrics_train_p2_m2[["Rsquared"]])
)

# Prediction for testing sample - covid
pred_test_p2_m2 <- predict(model_2_p2, newdata = X_test_data_p2_m2)
metrics_test_p2_m2 <- postResample(pred = pred_test_p2_m2,
                                   obs = y_test_data_p2_m2)
metrics_table_test_p2_m2 <- data.frame(
  Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
  Value  = c(metrics_test_p2_m2[["RMSE"]]^2,
             metrics_test_p2_m2[["RMSE"]],
             metrics_test_p2_m2[["MAE"]],
             metrics_test_p2_m2[["Rsquared"]])
)
## Performance Metrics for the Training Dataset:
##      Metric     Value
## 1       MSE 0.1240295
## 2      RMSE 0.3521782
## 3       MAE 0.2591119
## 4 R-squared 0.2638832
## Performance Metrics for the Testing Dataset:
##      Metric     Value
## 1       MSE 0.1077629
## 2      RMSE 0.3282726
## 3       MAE 0.2462835
## 4 R-squared 0.2922223
## Regression coefficient of Model 1:
## 
## Call:
## lm(formula = returns ~ ., data = train_data_p2_m2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.40345 -0.19946 -0.03503  0.16097  1.28868 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.591175   0.047373  12.479  < 2e-16 ***
## Mkt_beta                     11.393483   0.972606  11.714  < 2e-16 ***
## SMB_beta                      4.398360   0.644666   6.823 1.02e-11 ***
## HML_beta                      3.126379   0.852140   3.669 0.000247 ***
## log_volatility                0.139262   0.007828  17.790  < 2e-16 ***
## log_spread                    0.005219   0.005039   1.036 0.300402    
## scaled_market_share          -0.006909   0.005626  -1.228 0.219476    
## scaled_abs_corr_sp            0.061086   0.006986   8.744  < 2e-16 ***
## scaled_dollar_vol            -0.007637   0.005266  -1.450 0.147055    
## SECTORMaterials               0.141054   0.053298   2.647 0.008163 ** 
## SECTORIndustrials             0.139443   0.040457   3.447 0.000573 ***
## SECTORFinancials              0.193201   0.039455   4.897 1.01e-06 ***
## SECTORInformation_Technology  0.121068   0.040387   2.998 0.002737 ** 
## SECTORConsumer_Discretionary  0.302509   0.042456   7.125 1.22e-12 ***
## SECTORHealth_Care             0.034379   0.038975   0.882 0.377779    
## SECTOREnergy                  0.209086   0.061752   3.386 0.000716 ***
## SECTORUtilities               0.030553   0.061258   0.499 0.617980    
## SECTORReal_Estate             0.312307   0.084514   3.695 0.000222 ***
## SECTORCommunication_Services  0.229145   0.053253   4.303 1.72e-05 ***
## SECTORUnknown                 0.118594   0.036009   3.293 0.000998 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.353 on 4134 degrees of freedom
## Multiple R-squared:  0.2639, Adjusted R-squared:  0.2605 
## F-statistic:    78 on 19 and 4134 DF,  p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
##                     Mkt_beta                     SMB_beta 
##                     1.389091                     1.233495 
##                     HML_beta               log_volatility 
##                     1.172159                     2.112182 
##                   log_spread          scaled_market_share 
##                     1.751305                     1.127001 
##           scaled_abs_corr_sp            scaled_dollar_vol 
##                     1.618946                     1.147129 
##              SECTORMaterials            SECTORIndustrials 
##                     1.678586                     3.361116 
##             SECTORFinancials SECTORInformation_Technology 
##                     4.065494                     3.507963 
## SECTORConsumer_Discretionary            SECTORHealth_Care 
##                     2.740303                     4.523545 
##                 SECTOREnergy              SECTORUtilities 
##                     1.511425                     1.457989 
##            SECTORReal_Estate SECTORCommunication_Services 
##                     1.197435                     1.697703 
##                SECTORUnknown 
##                    10.732340
## Despite SectorUnknown has a VIF over 5, considering it is actually the group of data with missing value in Sector, while some other industry categories are important to the model. After trial and error, it is better to keep the industry variable in the model.
# Scatterplot of Prediction vs Actual on the test set.
# Fix: the title previously said "Non-covid Period", but Model 2 is the
# COVID-period model (see the "# COVID Period" split above and the COVID
# subtitles of the grouped plots below).
plot_data_p2_m2 <- data.frame(
  Predicted = pred_test_p2_m2,
  Actual = y_test_data_p2_m2
)

plot_m2_p2 <- ggplot(data = plot_data_p2_m2,
                     aes(x = Predicted,
                         y = Actual)) +
  geom_point() +
  # single fitted line across all points, labelled via a manual linetype scale
  geom_smooth(method = "lm",
              se = FALSE,
              color = "blue",
              aes(linetype = "Regression Line")) +
  scale_linetype_manual(name = NULL,
                        values = "solid",
                        labels = "Regression Line") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Evaluating Model 2's Performance: Predicted vs. Actual Returns in the Covid Period",
       x = "Predicted returns",
       y = "Actual returns")

print(plot_m2_p2)

Important variables

# Extract coefficients from Model 2 to rank factor impacts
coef_m2 <- coef(model_2_p2)

importance_m2 <- data.frame(
  Factors = names(coef_m2),
  Coefficient = coef_m2
)
# Drop the intercept row. Fix: the column is named "Factors"; the original
# `importance_m2$Factor` only worked through data.frame partial name
# matching and would silently return NULL on a tibble.
importance_m2 <- importance_m2[importance_m2$Factors != "(Intercept)", ]

# Plot with positive and negative impacts colored separately
plot_m2_p2_factors <- ggplot(importance_m2, aes(x = reorder(Factors, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(Coefficient, 3)),
            # nudge labels differently for positive vs negative bars
            vjust = ifelse(importance_m2$Coefficient > 0, 0.5, 0.3),
            color = "black") +
  coord_flip() +
  labs(title = "Impact of factors on returns for Model 2",
       x = "Factors",
       y = "Coefficient") +
  theme_minimal() +
  scale_fill_manual(
    values = c("TRUE" = "steelblue", "FALSE" = "coral"),
    labels = c("TRUE" = "Positive Impact", 
               "FALSE" = "Negative Impact"),
    name = ""
  ) +
  theme(
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

Group by Sector

# Attach the sector label of each test-set row to the test predictors
pred_sector_p2_m2 <- cbind(X_test_data_p2_m2,
                           SECTOR = task2_reg_p2_df1$SECTOR[-splitIndex_p2_m2])

data_sector_p2_m2 <- data.frame(
  Predicted = pred_test_p2_m2,
  Actual = y_test_data_p2_m2,
  SECTOR = pred_sector_p2_m2$SECTOR
)

data_sector_p2_m2$SECTOR <- as.character(data_sector_p2_m2$SECTOR)

# Calculating performance metrics by sector on the test set; R² is
# computed directly as 1 - SSE/SST within each sector
metrics_table_sector_p2_m2 <- data_sector_p2_m2 %>%
  group_by(SECTOR) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    R_Squared = 1 - sum((Actual - Predicted)^2) /
      sum((Actual - mean(Actual))^2)
  )

print(metrics_table_sector_p2_m2)
## # A tibble: 12 × 5
##    SECTOR                    MSE  RMSE   MAE R_Squared
##    <chr>                   <dbl> <dbl> <dbl>     <dbl>
##  1 Communication_Services 0.129  0.359 0.291 -0.149   
##  2 Consumer_Discretionary 0.137  0.370 0.310  0.126   
##  3 Consumer_Staples       0.156  0.395 0.321  0.192   
##  4 Energy                 0.227  0.476 0.433  0.0855  
##  5 Financials             0.0965 0.311 0.248  0.0482  
##  6 Health_Care            0.184  0.428 0.339  0.0978  
##  7 Industrials            0.158  0.398 0.331  0.0263  
##  8 Information_Technology 0.127  0.357 0.287 -0.000730
##  9 Materials              0.125  0.353 0.276 -0.399   
## 10 Real_Estate            0.0692 0.263 0.209  0.240   
## 11 Unknown                0.0824 0.287 0.201  0.366   
## 12 Utilities              0.0597 0.244 0.225  0.000703
# Predicted vs actual returns, points colored by industry sector,
# with one overall fitted line across all sectors
plot_m2_p2_sector <- ggplot(data_sector_p2_m2,
                            aes(x = Predicted, y = Actual, color = SECTOR)) +
  geom_point(size = 2) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm", se = FALSE, color = "blue") +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  labs(title = "Evaluating Model 2's Performance: Predicted vs. Actual Returns",
       subtitle = "During the COVID Period (Grouped by Industry Sector)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Industry Sector") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"))

print(plot_m2_p2_sector)

Group by Security Type

# Attach the security type of each test-set row to the test predictors
pred_type_p2_m2 <- cbind(X_test_data_p2_m2,
                         security_type = task2_reg_p2_df1$security_type[-splitIndex_p2_m2])

data_type_p2_m2 <- data.frame(
  Predicted = pred_test_p2_m2,
  Actual = y_test_data_p2_m2,
  security_type = pred_type_p2_m2$security_type
)

# Calculating performance metrics by security type on the test set; R² is
# computed directly as 1 - SSE/SST within each type
metrics_table_type_p2_m2 <- data_type_p2_m2 %>%
  group_by(security_type) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    R_Squared = 1 - sum((Actual - Predicted)^2) /
      sum((Actual - mean(Actual))^2)
  )

print(metrics_table_type_p2_m2)
## # A tibble: 2 × 5
##   security_type    MSE  RMSE   MAE R_Squared
##   <fct>          <dbl> <dbl> <dbl>     <dbl>
## 1 Stock         0.141  0.375 0.301     0.159
## 2 ETF           0.0555 0.236 0.160     0.378
# Predicted vs actual returns, points colored by security type,
# with one overall fitted line across both types
plot_m2_p2_type <- ggplot(data_type_p2_m2,
                          aes(x = Predicted, y = Actual, color = security_type)) +
  geom_point(size = 2) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm", se = FALSE, color = "blue") +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  labs(title = "Evaluating Model 2's Performance: Predicted vs. Actual Returns",
       subtitle = "During the COVID Period (Grouped by Security Type)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Security Type") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"))

print(plot_m2_p2_type)

Group by Company size

# Attach the company-size bucket of each test-set row to the test predictors
pred_size_p2_m2 <- cbind(X_test_data_p2_m2,
                         company_size = task2_reg_p2_df1$company_size[-splitIndex_p2_m2])

data_size_p2_m2 <- data.frame(
  Predicted = pred_test_p2_m2,
  Actual = y_test_data_p2_m2,
  company_size = pred_size_p2_m2$company_size
)

# Calculating performance metrics by company size on the test set; R² is
# computed directly as 1 - SSE/SST within each size bucket
metrics_table_size_p2_m2 <- data_size_p2_m2 %>%
  group_by(company_size) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    R_Squared = 1 - sum((Actual - Predicted)^2) /
      sum((Actual - mean(Actual))^2)
  )

print(metrics_table_size_p2_m2)
## # A tibble: 3 × 5
##   company_size    MSE  RMSE   MAE R_Squared
##   <fct>         <dbl> <dbl> <dbl>     <dbl>
## 1 Medium       0.108  0.329 0.260     0.265
## 2 Small        0.111  0.334 0.247     0.285
## 3 Large        0.0726 0.269 0.214     0.371
# Predicted vs actual returns, points colored by company size,
# with one overall fitted line across all size buckets
plot_m2_p2_size <- ggplot(data_size_p2_m2,
                          aes(x = Predicted, y = Actual, color = company_size)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm", se = FALSE, color = "blue") +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  labs(title = "Evaluating Model 2's Performance: Predicted vs. Actual Returns",
       subtitle = "During the COVID Period (Grouped by Company Size)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Company Size") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"))

print(plot_m2_p2_size)

Model 3: (ETFs) Returns: 2019-12-14 to 2020-01-20 & X variables: 2019-08-20 to 2019-12-14

# Period 1 - Dataset 2
# ETF-only, non-COVID run: judging by the date windows passed in, returns
# are measured over 2019-12-14..2020-01-20 and the explanatory variables
# are aggregated over 2019-08-20..2019-12-14, using the non-COVID
# Fama-French betas. NOTE(review): argument meanings inferred from the
# values — confirm against transform_dataset_for_task2's definition.
task2_p1_df2_1 <- transform_dataset_for_task2(task2, "2019-12-14", "2020-01-20", "2019-08-20", "2019-12-14", "ETFs", FFF_beta_noncovid) 
## Highest Date: [1] "2020-01-17"
## 
## Lowest Date: [1] "2019-12-16"
## 
## Number of PERMNO in the dataset that calculated returns:  5790 
## Number of PERMNO that are in the dataset that aggregate x variables:  2180 
## Number of PERMNO that are in the both datasets:  2148 
## Number of PERMNO that are in the final dataset:  2129
# Remove outlier for returns
task2_p1_df2_1 <- remove_outliers_all(task2_p1_df2_1, "returns")

# Inspect the distribution of Returns after outlier removal
hist_returns_p1_df2 <- ggplot(task2_p1_df2_1, aes(x = returns)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
print(hist_returns_p1_df2)

# Report how many distinct securities remain after outlier removal
cat("Sample size after remove outlier: ", n_distinct(task2_p1_df2_1$PERMNO))
## Sample size after remove outlier:  1990
# Convert the categorical variables to factors (levels taken from the
# pre-filter data) and append one-hot dummy columns for each of them,
# keeping only rows with a non-missing volatility. Unlike the stock
# dataset, no security_type dummies are built here (ETFs only).
task2_reg_p1_df2 <- task2_p1_df2_1 %>%
  filter(!is.na(volatility)) %>%
  mutate(
    company_size = factor(company_size, levels = unique(task2_p1_df2_1$company_size)),
    SECTOR       = factor(SECTOR,       levels = unique(task2_p1_df2_1$SECTOR))
  )

# One dummy column per level (intercept suppressed), appended in the same
# order as the original two bind_cols() calls
for (cat_var_df2 in c("company_size", "SECTOR")) {
  dummy_mat_df2 <- model.matrix(
    as.formula(paste0("~ ", cat_var_df2, " - 1")),
    data = task2_reg_p1_df2
  )
  task2_reg_p1_df2 <- bind_cols(task2_reg_p1_df2, as_tibble(dummy_mat_df2))
}
## tibble [1,964 × 49] (S3: tbl_df/tbl/data.frame)
##  $ PERMNO                      : chr [1:1964] "10113" "11182" "11264" "11996" ...
##  $ returns                     : num [1:1964] 0.05341 0.01247 0.09027 0.00653 0.06506 ...
##  $ SHRCD                       : int [1:1964] 73 73 73 73 73 73 73 73 73 73 ...
##  $ TICKER                      : chr [1:1964] "AADR" "GASL" "RETL" "EMLC" ...
##  $ SECTOR                      : Factor w/ 6 levels "Unknown","Financials",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ company_size                : Factor w/ 3 levels "Small","Medium",..: 1 1 1 2 2 2 2 2 1 1 ...
##  $ VOL                         : num [1:1964] 6.25e+05 1.58e+08 3.03e+06 1.79e+08 1.75e+09 ...
##  $ dollar_vol                  : num [1:1964] 3.09e+07 1.21e+09 7.16e+07 5.95e+09 1.49e+10 ...
##  $ bid_ask_spread              : num [1:1964] 0.003669 0.001949 0.00127 0.000446 0.001215 ...
##  $ turnover_ratio              : num [1:1964] 0.0036 0.4615 0.0612 0.0147 0.0234 ...
##  $ volatility                  : num [1:1964] 0.1116 1.2453 0.5851 0.0666 0.1401 ...
##  $ sharpe_ratio                : num [1:1964] -228.4 -20.2 -42.9 -381.5 -187 ...
##  $ market_share                : num [1:1964] 0.00027 0.19917 0.05839 0.11148 1.2461 ...
##  $ market_cap                  : num [1:1964] 8.59e+09 2.70e+09 1.17e+09 4.05e+11 6.44e+11 ...
##  $ abs_corr_sp                 : num [1:1964] 0.834 0.578 0.841 0.638 0.605 ...
##  $ security_type               : chr [1:1964] "ETF" "ETF" "ETF" "ETF" ...
##  $ tracking_error              : num [1:1964] 0.012 0.1097 0.0557 0.0165 0.0305 ...
##  $ log_vol                     : num [1:1964] 13.3 18.9 14.9 19 21.3 ...
##  $ log_dollar_vol              : num [1:1964] 17.2 20.9 18.1 22.5 23.4 ...
##  $ log_spread                  : num [1:1964] -5.61 -6.24 -6.67 -7.71 -6.71 ...
##  $ log_turnover_ratio          : num [1:1964] -5.626 -0.773 -2.794 -4.221 -3.756 ...
##  $ log_volatility              : num [1:1964] -2.192 0.219 -0.536 -2.709 -1.965 ...
##  $ log_market_share            : num [1:1964] -8.21 -1.61 -2.84 -2.19 0.22 ...
##  $ log_market_cap              : num [1:1964] 22.9 21.7 20.9 26.7 27.2 ...
##  $ log_tracking_error          : num [1:1964] -4.42 -2.21 -2.89 -4.1 -3.49 ...
##  $ vol_cap_ratio               : num [1:1964] 7.28e-05 5.85e-02 2.58e-03 4.41e-04 2.71e-03 ...
##  $ log_vol_cap_ratio           : num [1:1964] -9.53 -2.84 -5.96 -7.73 -5.91 ...
##  $ scaled_vol                  : num [1:1964, 1] -0.171 0.435 -0.162 0.516 6.569 ...
##   ..- attr(*, "scaled:center")= num 44874771
##   ..- attr(*, "scaled:scale")= num 2.59e+08
##  $ scaled_dollar_vol           : num [1:1964, 1] -0.0852 -0.0517 -0.084 0.0828 0.3359 ...
##   ..- attr(*, "scaled:center")= num 3.03e+09
##   ..- attr(*, "scaled:scale")= num 3.52e+10
##  $ scaled_spread               : num [1:1964, 1] 0.5702 -0.0831 -0.3413 -0.654 -0.3622 ...
##   ..- attr(*, "scaled:center")= num 0.00217
##   ..- attr(*, "scaled:scale")= num 0.00263
##  $ scaled_turnover_ratio       : num [1:1964, 1] -0.2741 7.5801 0.7132 -0.0839 0.065 ...
##   ..- attr(*, "scaled:center")= num 0.0196
##   ..- attr(*, "scaled:scale")= num 0.0583
##  $ scaled_volatility           : num [1:1964, 1] -0.1487 9.715 3.9712 -0.5408 0.0992 ...
##   ..- attr(*, "scaled:center")= num 0.129
##   ..- attr(*, "scaled:scale")= num 0.115
##  $ scaled_sharpe_ratio         : num [1:1964, 1] 0.2163 0.4132 0.3918 0.0716 0.2555 ...
##   ..- attr(*, "scaled:center")= num -457
##   ..- attr(*, "scaled:scale")= num 1057
##  $ scaled_market_share         : num [1:1964, 1] -0.1813 -0.0229 -0.135 -0.0927 0.8103 ...
##   ..- attr(*, "scaled:center")= num 0.228
##   ..- attr(*, "scaled:scale")= num 1.26
##  $ scaled_market_cap           : num [1:1964, 1] -0.179 -0.186 -0.188 0.309 0.603 ...
##   ..- attr(*, "scaled:center")= num 1.54e+11
##   ..- attr(*, "scaled:scale")= num 8.12e+11
##  $ scaled_abs_corr_sp          : num [1:1964, 1] 0.335 -0.654 0.363 -0.42 -0.55 ...
##   ..- attr(*, "scaled:center")= num 0.747
##   ..- attr(*, "scaled:scale")= num 0.259
##  $ scaled_tracking_error       : num [1:1964, 1] -0.29246 5.86143 2.45606 -0.00806 0.86839 ...
##   ..- attr(*, "scaled:center")= num 0.0167
##   ..- attr(*, "scaled:scale")= num 0.0159
##  $ Mkt_beta                    : num [1:1964] 0.0065 0.03727 0.03821 0.00104 0.00481 ...
##  $ SMB_beta                    : num [1:1964] -0.00132 0.04669 0.012 -0.00107 0.00296 ...
##  $ HML_beta                    : num [1:1964] -0.00489 0.059424 0.025013 -0.000253 0.004463 ...
##  $ company_sizeSmall           : num [1:1964] 1 1 1 0 0 0 0 0 1 1 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeMedium          : num [1:1964] 0 0 0 1 1 1 1 1 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeLarge           : num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ SECTORUnknown               : num [1:1964] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORFinancials            : num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORInformation_Technology: num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORCommunication_Services: num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORIndustrials           : num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORHealth_Care           : num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 49
##    PERMNO returns SHRCD TICKER SECTOR  company_size        VOL   dollar_vol
##    <chr>    <dbl> <int> <chr>  <fct>   <fct>             <dbl>        <dbl>
##  1 10113  0.0534     73 AADR   Unknown Small            625314    30943487.
##  2 11182  0.0125     73 GASL   Unknown Small         157637284  1211479740.
##  3 11264  0.0903     73 RETL   Unknown Small           3030234    71590797.
##  4 11996  0.00653    73 EMLC   Unknown Medium        178576800  5947066455.
##  5 12035  0.0651     73 AMLP   Unknown Medium       1745883591 14865575916.
##  6 12059  0.00473    73 SCHR   Unknown Medium         34034683  1886731858.
##  7 12064  0.00159    73 SCHO   Unknown Medium         79027981  3996375262.
##  8 12065  0.00264    73 SCHP   Unknown Medium         59689638  3393526567.
##  9 12075  0.0101     73 ELD    Unknown Small           2743109    95351320.
## 10 12098  0.0453     73 ECON   Unknown Small           2447440    54618394.
## # ℹ 41 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## #   abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## #   log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## #   log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## #   log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## #   log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …

Correlation of variables

# Find the correlation between returns and each potential explanatory variable
cor_p1_m3 <- cor(
  task2_reg_p1_df2 %>%
    select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)),
  use = "pairwise.complete.obs"
)

# |correlation| > 0.7 = high risk of multicollinearity.
# Restrict to the upper triangle so each pair is reported once and the
# diagonal (self-correlation, exactly 1) is excluded. The previous
# `filter(Correlation < 1)` also discarded genuine perfectly correlated
# distinct pairs (e.g. a scaled variable vs. its raw counterpart) and
# listed every remaining pair twice.
high_corr_p1_m3 <- which(abs(cor_p1_m3) > 0.7 & upper.tri(cor_p1_m3), arr.ind = TRUE)
high_corr_pairs_p1_m3 <- data.frame(
  Feature1 = rownames(cor_p1_m3)[high_corr_p1_m3[, 1]],
  Feature2 = colnames(cor_p1_m3)[high_corr_p1_m3[, 2]],
  Correlation = cor_p1_m3[high_corr_p1_m3]
)

# Inspect the variable pairs with multicollinearity
head(high_corr_pairs_p1_m3)
##            Feature1       Feature2 Correlation
## 1        market_cap     dollar_vol   0.7143606
## 2 scaled_market_cap     dollar_vol   0.7143606
## 3        log_spread bid_ask_spread   0.7674109
## 4    log_volatility     volatility   0.7653878
## 5        dollar_vol     market_cap   0.7143606
## 6 scaled_dollar_vol     market_cap   0.7143606
# Inspect all pairwise correlations in long format (reshape2::melt gives one
# row per Var1/Var2 pair)
head(melt(cor_p1_m3))
##             Var1    Var2       value
## 1        returns returns  1.00000000
## 2            VOL returns  0.04421879
## 3     dollar_vol returns  0.03832168
## 4 bid_ask_spread returns  0.02658229
## 5 turnover_ratio returns -0.02357584
## 6     volatility returns  0.16668020

Build Model 3

# Train/test split for Model 3 (non-COVID period) --------------------------
set.seed(847)

# Explanatory variables and target
X_p1_m3 <- task2_reg_p1_df2 %>%
  select(Mkt_beta, log_dollar_vol, log_tracking_error, log_market_share,
         log_turnover_ratio, log_volatility)
y_p1_m3 <- task2_reg_p1_df2$returns

# 80/20 partition stratified on the target
splitIndex_p1_m3 <- createDataPartition(y_p1_m3, p = 0.8, list = FALSE)

X_train_data_p1_m3 <- X_p1_m3[splitIndex_p1_m3, ]
X_test_data_p1_m3  <- X_p1_m3[-splitIndex_p1_m3, ]
y_train_data_p1_m3 <- y_p1_m3[splitIndex_p1_m3]
y_test_data_p1_m3  <- y_p1_m3[-splitIndex_p1_m3]

train_data_p1_m3 <- cbind(X_train_data_p1_m3, returns = y_train_data_p1_m3)

# Fit Model 3: OLS of returns on all selected predictors
model_3_p1 <- lm(returns ~ ., data = train_data_p1_m3)

# In-sample predictions and performance metrics
pred_train_p1_m3 <- predict(model_3_p1, newdata = X_train_data_p1_m3)
metrics_train_p1_m3 <- postResample(pred = pred_train_p1_m3,
                                    obs = train_data_p1_m3$returns)
metrics_table_train_p1_m3 <- data.frame(
  Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
  Value = c(metrics_train_p1_m3["RMSE"]^2,   # MSE = RMSE squared
            metrics_train_p1_m3["RMSE"],
            metrics_train_p1_m3["MAE"],
            metrics_train_p1_m3["Rsquared"])
)

# Out-of-sample predictions and performance metrics
pred_test_p1_m3 <- predict(model_3_p1, newdata = X_test_data_p1_m3)
metrics_test_p1_m3 <- postResample(pred = pred_test_p1_m3,
                                   obs = y_test_data_p1_m3)
metrics_table_test_p1_m3 <- data.frame(
  Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
  Value = c(metrics_test_p1_m3["RMSE"]^2,
            metrics_test_p1_m3["RMSE"],
            metrics_test_p1_m3["MAE"],
            metrics_test_p1_m3["Rsquared"])
)
## Performance Metrics for the Training Dataset:
##      Metric        Value
## 1       MSE 0.0005017145
## 2      RMSE 0.0223989841
## 3       MAE 0.0165767300
## 4 R-squared 0.2138294556
## Performance Metrics for the Testing Dataset:
##      Metric        Value
## 1       MSE 0.0003576866
## 2      RMSE 0.0189126033
## 3       MAE 0.0143617736
## 4 R-squared 0.3400332199
## Regression coefficient of Model 1:
## 
## Call:
## lm(formula = returns ~ ., data = train_data_p1_m3)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.080371 -0.012175 -0.000294  0.011288  0.072915 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.0111798  0.0077640   1.440   0.1501    
## Mkt_beta            0.7686952  0.1176976   6.531 8.80e-11 ***
## log_dollar_vol     -0.0005336  0.0003067  -1.740   0.0821 .  
## log_tracking_error -0.0093686  0.0010685  -8.768  < 2e-16 ***
## log_market_share    0.0012796  0.0002437   5.250 1.73e-07 ***
## log_turnover_ratio  0.0016244  0.0006359   2.555   0.0107 *  
## log_volatility      0.0052169  0.0008652   6.030 2.04e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02245 on 1565 degrees of freedom
## Multiple R-squared:  0.2138, Adjusted R-squared:  0.2108 
## F-statistic: 70.94 on 6 and 1565 DF,  p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
##           Mkt_beta     log_dollar_vol log_tracking_error   log_market_share 
##           1.572187           2.157164           1.414408           1.739820 
## log_turnover_ratio     log_volatility 
##           1.628724           1.479572
# Scatterplot of predicted vs. actual returns on the test set
plot_data_p1_m3 <- data.frame(
  Predicted = pred_test_p1_m3,
  Actual = y_test_data_p1_m3
)

plot_m3_p1 <- ggplot(plot_data_p1_m3, aes(x = Predicted, y = Actual)) +
  geom_point() +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm", se = FALSE, color = "blue") +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Evaluating Model 3's Performance: Predicted vs. Actual Returns in the Non-COVID Period",
       x = "Predicted returns",
       y = "Actual returns")

print(plot_m3_p1)

Important variables

# Extract fitted coefficients from Model 3
coef_m3 <- coef(model_3_p1)

importance_m3 <- data.frame(
  Factors = names(coef_m3),
  Coefficient = coef_m3
)
# Drop the intercept row. The column is named "Factors"; the previous
# `$Factor` only worked via data.frame partial matching and would silently
# return NULL (dropping nothing) on a tibble.
importance_m3 <- importance_m3[importance_m3$Factors != "(Intercept)", ]

# Diverging bar chart of coefficients, colour-coded by sign of the impact
plot_m3_p1_factors <- ggplot(importance_m3,
                             aes(x = reorder(Factors, Coefficient),
                                 y = Coefficient,
                                 fill = Coefficient > 0)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(Coefficient, 3)),
            vjust = ifelse(importance_m3$Coefficient > 0, 0.5, 0.3),
            color = "black") +
  coord_flip() +
  labs(title = "Impact of factors on Returns for Model 3",
       x = "Factors",
       y = "Coefficient") +
  theme_minimal() +
  scale_fill_manual(
    values = c("TRUE" = "steelblue", "FALSE" = "coral"),
    labels = c("TRUE" = "Positive Impact", 
               "FALSE" = "Negative Impact"),
    name = ""
  ) +
  theme(
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

print(plot_m3_p1_factors)

Group by Sector

# Evaluate Model 3's test-set performance within each industry sector
pred_sector_p1_m3 <- cbind(X_test_data_p1_m3,
                           SECTOR = task2_reg_p1_df2$SECTOR[-splitIndex_p1_m3])

data_sector_p1_m3 <- data.frame(
  Predicted = pred_test_p1_m3,
  Actual = y_test_data_p1_m3,
  SECTOR = pred_sector_p1_m3$SECTOR
)

data_sector_p1_m3$SECTOR <- as.character(data_sector_p1_m3$SECTOR)

# Per-sector evaluation metrics. R-squared is undefined when a sector has no
# variance in Actual (e.g. a single test observation, total SS = 0); report
# NA there instead of the -Inf produced by dividing by zero.
metrics_table_sector_p1_m3 <- data_sector_p1_m3 %>% 
  group_by(SECTOR) %>% 
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    residuals = sum((Actual - Predicted)^2),
    total = sum((Actual - mean(Actual))^2),
    R_Squared = if_else(total > 0, 1 - (residuals / total), NA_real_)
  ) %>% 
  select(-c(residuals, total)) 

print(metrics_table_sector_p1_m3)
## # A tibble: 3 × 5
##   SECTOR                       MSE    RMSE     MAE R_Squared
##   <chr>                      <dbl>   <dbl>   <dbl>     <dbl>
## 1 Communication_Services 0.000443  0.0210  0.0210   -Inf    
## 2 Information_Technology 0.0000188 0.00434 0.00434  -Inf    
## 3 Unknown                0.000358  0.0189  0.0144      0.333
# Scatterplot of predicted vs. actual test-set returns, coloured by sector
plot_m3_p1_sector <- ggplot(data_sector_p1_m3,
                            aes(x = Predicted, y = Actual, color = SECTOR)) +
  geom_point(size = 2) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm", se = FALSE, color = "blue") +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold")) +
  labs(title = "Evaluating Model 3's Performance: Predicted vs. Actual Returns",
       subtitle = "During the Non-COVID Period (Grouped by Industry Sector)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Industry Sector")


print(plot_m3_p1_sector)

Group by Company size

# Evaluate Model 3's test-set performance within each company-size bucket
pred_size_p1_m3 <- cbind(X_test_data_p1_m3,
                         company_size = task2_reg_p1_df2$company_size[-splitIndex_p1_m3])

data_size_p1_m3 <- data.frame(
  Predicted = pred_test_p1_m3,
  Actual = y_test_data_p1_m3,
  company_size = pred_size_p1_m3$company_size
)

# Per-size-group evaluation metrics. Guard R-squared against groups with
# zero variance in Actual (total SS = 0 would otherwise yield -Inf).
metrics_table_size_p1_m3 <- data_size_p1_m3 %>% 
  group_by(company_size) %>% 
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    residuals = sum((Actual - Predicted)^2),
    total = sum((Actual - mean(Actual))^2),
    R_Squared = if_else(total > 0, 1 - (residuals / total), NA_real_)
  ) %>% 
  select(-c(residuals, total))

print(metrics_table_size_p1_m3)
## # A tibble: 3 × 5
##   company_size      MSE   RMSE     MAE R_Squared
##   <fct>           <dbl>  <dbl>   <dbl>     <dbl>
## 1 Small        0.000365 0.0191 0.0146      0.324
## 2 Medium       0.000374 0.0193 0.0142      0.303
## 3 Large        0.000189 0.0138 0.00937     0.495
# Scatterplot of predicted vs. actual test-set returns by company size
plot_m3_p1_size <- ggplot(data_size_p1_m3,
                          aes(x = Predicted, y = Actual, color = company_size)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm", se = FALSE, color = "blue") +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold")) +
  labs(title = "Evaluating Model 3's Performance: Predicted vs. Actual Returns",
       subtitle = "During the Non-COVID Period (Grouped by Company Size)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Company Size")


print(plot_m3_p1_size)

Model 4: (ETF) Returns: 2020-02-14 to 2020-03-20 & X variables: 2019-08-20 to 2020-02-14

# Period 2 - Dataset 2: COVID-period (2020-02-14 to 2020-03-20) returns for
# ETFs with explanatory variables aggregated over 2019-08-20 to 2020-02-14
task2_p2_df2_1 <- transform_dataset_for_task2(task2, "2020-02-14", "2020-03-20", "2019-08-20", "2020-02-14", "ETFs", FFF_beta_covid)
## Highest Date: [1] "2020-02-14"
## 
## Lowest Date: [1] "2020-03-18"
## 
## Number of PERMNO in the dataset that calculated returns:  5764 
## Number of PERMNO that are in the dataset that aggregate x variables:  2223 
## Number of PERMNO that are in the both datasets:  2168 
## Number of PERMNO that are in the final dataset:  2127
# Remove outliers in returns so extreme values do not dominate the regression
task2_p2_df2_1 <- remove_outliers_all(task2_p2_df2_1, "returns")

# Inspect the distribution of returns after outlier removal
print(
  ggplot(task2_p2_df2_1, aes(x = returns)) +
    geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
    theme_minimal() +
    labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
)

# Count distinct securities remaining in the sample
cat("Sample size after remove outlier: ", length(unique(task2_p2_df2_1$PERMNO)))
## Sample size after remove outlier:  1966
# Create dummy (one-hot) columns for the categorical variables.
# `model.matrix(~ x - 1)` drops the intercept so every level gets its own
# indicator column. Note the magrittr subtlety: because `.` appears only in
# a nested call, the pipe ALSO inserts the LHS as bind_cols()' first
# argument, i.e. the dummies are appended to the existing columns.
# Factor levels come from the pre-filter dataset task2_p2_df2_1, so level
# order follows first appearance there.
task2_reg_p2_df2 <- task2_p2_df2_1 %>%
  filter(!is.na(volatility)) %>% 
  mutate(company_size = factor(company_size, levels = unique(task2_p2_df2_1$company_size)),
         SECTOR = factor(SECTOR, levels = unique(task2_p2_df2_1$SECTOR))) %>%
  bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
  bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) 

# Inspect any missing values in the important variables
colSums(is.na(task2_reg_p2_df2))
##                       PERMNO                      returns 
##                            0                            0 
##                        SHRCD                       TICKER 
##                            0                            0 
##                       SECTOR                 company_size 
##                            0                            0 
##                          VOL                   dollar_vol 
##                            0                            0 
##               bid_ask_spread               turnover_ratio 
##                            0                            0 
##                   volatility                 sharpe_ratio 
##                            0                            0 
##                 market_share                   market_cap 
##                            0                            0 
##                  abs_corr_sp                security_type 
##                            0                            0 
##               tracking_error                      log_vol 
##                            0                            0 
##               log_dollar_vol                   log_spread 
##                            0                            0 
##           log_turnover_ratio               log_volatility 
##                            0                            0 
##             log_market_share               log_market_cap 
##                            0                            0 
##           log_tracking_error                vol_cap_ratio 
##                            0                            0 
##            log_vol_cap_ratio                              
##                            0                            0 
##                                                           
##                            0                            0 
##                                                           
##                            0                            0 
##                                                           
##                            0                            0 
##                                                           
##                            0                            0 
##                                                  Mkt_beta 
##                            0                            0 
##                     SMB_beta                     HML_beta 
##                            0                            0 
##            company_sizeSmall           company_sizeMedium 
##                            0                            0 
##            company_sizeLarge                SECTORUnknown 
##                            0                            0 
##             SECTORFinancials SECTORInformation_Technology 
##                            0                            0 
##            SECTORHealth_Care 
##                            0
## tibble [1,947 × 47] (S3: tbl_df/tbl/data.frame)
##  $ PERMNO                      : chr [1:1947] "10113" "11407" "11996" "12054" ...
##  $ returns                     : num [1:1947] 0.6411 0.656 0.2445 0.6555 -0.0236 ...
##  $ SHRCD                       : int [1:1947] 73 73 73 73 73 73 73 73 73 73 ...
##  $ TICKER                      : chr [1:1947] "AADR" "LIT" "EMLC" "SCIF" ...
##  $ SECTOR                      : Factor w/ 4 levels "Unknown","Financials",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ company_size                : Factor w/ 3 levels "Small","Medium",..: 1 1 2 1 2 2 2 1 1 1 ...
##  $ VOL                         : num [1:1947] 1.38e+06 2.18e+07 2.75e+08 5.28e+06 5.36e+07 ...
##  $ dollar_vol                  : num [1:1947] 7.21e+07 6.04e+08 9.19e+09 1.73e+08 2.97e+09 ...
##  $ bid_ask_spread              : num [1:1947] 0.004037 0.001298 0.000419 0.003274 0.000287 ...
##  $ turnover_ratio              : num [1:1947] 0.00535 0.0095 0.01524 0.01108 0.00496 ...
##  $ volatility                  : num [1:1947] 0.1065 0.1914 0.0576 0.1852 0.0368 ...
##  $ sharpe_ratio                : num [1:1947] -237 -137 -442 -140 -667 ...
##  $ market_share                : num [1:1947] 0.0011 4.8669 0.1553 0.1261 0.0696 ...
##  $ market_cap                  : num [1:1947] 1.34e+10 6.01e+10 6.04e+11 1.56e+10 5.98e+11 ...
##  $ abs_corr_sp                 : num [1:1947] 0.834 0.82 0.638 0.725 0.537 ...
##  $ security_type               : chr [1:1947] "ETF" "ETF" "ETF" "ETF" ...
##  $ tracking_error              : num [1:1947] 0.012 0.0157 0.0165 0.0179 0.0226 ...
##  $ log_vol                     : num [1:1947] 14.1 16.9 19.4 15.5 17.8 ...
##  $ log_dollar_vol              : num [1:1947] 18.1 20.2 22.9 19 21.8 ...
##  $ log_spread                  : num [1:1947] -5.51 -6.65 -7.77 -5.72 -8.15 ...
##  $ log_turnover_ratio          : num [1:1947] -5.23 -4.66 -4.18 -4.5 -5.31 ...
##  $ log_volatility              : num [1:1947] -2.24 -1.65 -2.85 -1.69 -3.3 ...
##  $ log_market_share            : num [1:1947] -6.81 1.58 -1.86 -2.07 -2.66 ...
##  $ log_market_cap              : num [1:1947] 23.3 24.8 27.1 23.5 27.1 ...
##  $ log_tracking_error          : num [1:1947] -4.42 -4.15 -4.1 -4.03 -3.79 ...
##  $ vol_cap_ratio               : num [1:1947] 1.03e-04 3.62e-04 4.55e-04 3.38e-04 8.97e-05 ...
##  $ log_vol_cap_ratio           : num [1:1947] -9.18 -7.92 -7.7 -7.99 -9.32 ...
##  $ scaled_vol                  : num [1:1947, 1] -0.1716 -0.1184 0.541 -0.1615 -0.0353 ...
##   ..- attr(*, "scaled:center")= num 67184777
##   ..- attr(*, "scaled:scale")= num 3.83e+08
##  $ scaled_dollar_vol           : num [1:1947, 1] -0.0835 -0.0738 0.0828 -0.0816 -0.0306 ...
##   ..- attr(*, "scaled:center")= num 4.65e+09
##   ..- attr(*, "scaled:scale")= num 5.48e+10
##  $ scaled_spread               : num [1:1947, 1] 0.726 -0.324 -0.661 0.434 -0.711 ...
##   ..- attr(*, "scaled:center")= num 0.00214
##   ..- attr(*, "scaled:scale")= num 0.00261
##  $ scaled_turnover_ratio       : num [1:1947, 1] -0.2626 -0.1877 -0.0841 -0.1593 -0.2695 ...
##   ..- attr(*, "scaled:center")= num 0.0199
##   ..- attr(*, "scaled:scale")= num 0.0554
##  $ scaled_volatility           : num [1:1947, 1] -0.144 0.641 -0.596 0.584 -0.789 ...
##   ..- attr(*, "scaled:center")= num 0.122
##   ..- attr(*, "scaled:scale")= num 0.108
##  $ scaled_sharpe_ratio         : num [1:1947, 1] 0.2254 0.3206 0.0325 0.3179 -0.1805 ...
##   ..- attr(*, "scaled:center")= num -476
##   ..- attr(*, "scaled:scale")= num 1059
##  $ scaled_market_share         : num [1:1947, 1] -0.178 2.55 -0.092 -0.108 -0.14 ...
##   ..- attr(*, "scaled:center")= num 0.319
##   ..- attr(*, "scaled:scale")= num 1.78
##  $ scaled_market_cap           : num [1:1947, 1] -0.176 -0.139 0.293 -0.174 0.288 ...
##   ..- attr(*, "scaled:center")= num 2.35e+11
##   ..- attr(*, "scaled:scale")= num 1.26e+12
##  $ scaled_abs_corr_sp          : num [1:1947, 1] 0.3314 0.28 -0.4246 -0.0899 -0.8161 ...
##   ..- attr(*, "scaled:center")= num 0.748
##   ..- attr(*, "scaled:scale")= num 0.258
##  $ scaled_tracking_error       : num [1:1947, 1] -0.29226 -0.05883 -0.00565 0.07711 0.37667 ...
##   ..- attr(*, "scaled:center")= num 0.0166
##   ..- attr(*, "scaled:scale")= num 0.0157
##  $ Mkt_beta                    : num [1:1947] 0.00699 0.01214 0.00149 0.00373 -0.00174 ...
##  $ SMB_beta                    : num [1:1947] -0.001252 0.003313 -0.000873 0.001297 -0.000373 ...
##  $ HML_beta                    : num [1:1947] -4.80e-03 2.80e-04 -4.41e-05 2.13e-03 -1.56e-03 ...
##  $ company_sizeSmall           : num [1:1947] 1 1 0 1 0 0 0 1 1 1 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeMedium          : num [1:1947] 0 0 1 0 1 1 1 0 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeLarge           : num [1:1947] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ SECTORUnknown               : num [1:1947] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "assign")= int [1:4] 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORFinancials            : num [1:1947] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:4] 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORInformation_Technology: num [1:1947] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:4] 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORHealth_Care           : num [1:1947] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:4] 1 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 47
##    PERMNO returns SHRCD TICKER SECTOR  company_size       VOL  dollar_vol
##    <chr>    <dbl> <int> <chr>  <fct>   <fct>            <dbl>       <dbl>
##  1 10113   0.641     73 AADR   Unknown Small          1377219   72137254.
##  2 11407   0.656     73 LIT    Unknown Small         21779548  603877429.
##  3 11996   0.244     73 EMLC   Unknown Medium       274606166 9192444608.
##  4 12054   0.656     73 SCIF   Unknown Small          5283845  173272234.
##  5 12059  -0.0236    73 SCHR   Unknown Medium        53634151 2970639004.
##  6 12064  -0.0158    73 SCHO   Unknown Medium       120765351 6106187629.
##  7 12065   0.0853    73 SCHP   Unknown Medium        87431310 4977006530.
##  8 12075   0.298     73 ELD    Unknown Small          4382473  153885205.
##  9 12098   0.341     73 ECON   Unknown Small          3895807   88874301.
## 10 12105   0.402     73 ENZL   Unknown Small          5536801  300116929.
## # ℹ 39 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## #   abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## #   log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## #   log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## #   log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## #   log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …

Correlation of variables

# Find the correlation between returns and each potential explanatory variable
cor_p2_m4 <- cor(
  task2_reg_p2_df2 %>%
    select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)),
  use = "pairwise.complete.obs"
)

# |correlation| > 0.7 = high risk of multicollinearity.
# Restrict to the upper triangle so each pair is reported once and the
# diagonal (self-correlation, exactly 1) is excluded. The previous
# `filter(Correlation < 1)` also discarded genuine perfectly correlated
# distinct pairs (e.g. a scaled variable vs. its raw counterpart) and
# listed every remaining pair twice.
high_corr_p2_m4 <- which(abs(cor_p2_m4) > 0.7 & upper.tri(cor_p2_m4), arr.ind = TRUE)
high_corr_pairs_p2_m4 <- data.frame(
  Feature1 = rownames(cor_p2_m4)[high_corr_p2_m4[, 1]],
  Feature2 = colnames(cor_p2_m4)[high_corr_p2_m4[, 2]],
  Correlation = cor_p2_m4[high_corr_p2_m4]
)

# Inspect the variable pairs with multicollinearity
head(high_corr_pairs_p2_m4)
##            Feature1       Feature2 Correlation
## 1        market_cap     dollar_vol   0.7210814
## 2 scaled_market_cap     dollar_vol   0.7210814
## 3        log_spread bid_ask_spread   0.7265476
## 4     vol_cap_ratio turnover_ratio   0.7700479
## 5    log_volatility     volatility   0.8074679
## 6    log_volatility   sharpe_ratio   0.7121814
# Inspect all pairwise correlations in long format (reshape2::melt gives one
# row per Var1/Var2 pair)
head(melt(cor_p2_m4))
##             Var1    Var2        value
## 1        returns returns  1.000000000
## 2            VOL returns -0.003687189
## 3     dollar_vol returns -0.005025484
## 4 bid_ask_spread returns  0.025407662
## 5 turnover_ratio returns -0.019845336
## 6     volatility returns  0.418324811

Build Model 4

# Train/test split for Model 4 (COVID period) ------------------------------
set.seed(847)

# Explanatory variables and target
X_p2_m4 <- task2_reg_p2_df2 %>%
  select(Mkt_beta, log_dollar_vol, log_tracking_error, log_turnover_ratio,
         log_volatility, scaled_abs_corr_sp)
y_p2_m4 <- task2_reg_p2_df2$returns

# 80/20 partition stratified on the target
splitIndex_p2_m4 <- createDataPartition(y_p2_m4, p = 0.8, list = FALSE)

X_train_data_p2_m4 <- X_p2_m4[splitIndex_p2_m4, ]
X_test_data_p2_m4  <- X_p2_m4[-splitIndex_p2_m4, ]
y_train_data_p2_m4 <- y_p2_m4[splitIndex_p2_m4]
y_test_data_p2_m4  <- y_p2_m4[-splitIndex_p2_m4]

train_data_p2_m4 <- cbind(X_train_data_p2_m4, returns = y_train_data_p2_m4)

# Fit Model 4: OLS of returns on all selected predictors
model_4_p2 <- lm(returns ~ ., data = train_data_p2_m4)

# In-sample predictions and performance metrics
pred_train_p2_m4 <- predict(model_4_p2, newdata = X_train_data_p2_m4)
metrics_train_p2_m4 <- postResample(pred = pred_train_p2_m4,
                                    obs = train_data_p2_m4$returns)
metrics_table_train_p2_m4 <- data.frame(
  Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
  Value = c(metrics_train_p2_m4["RMSE"]^2,   # MSE = RMSE squared
            metrics_train_p2_m4["RMSE"],
            metrics_train_p2_m4["MAE"],
            metrics_train_p2_m4["Rsquared"])
)

# Out-of-sample predictions and performance metrics
pred_test_p2_m4 <- predict(model_4_p2, newdata = X_test_data_p2_m4)
metrics_test_p2_m4 <- postResample(pred = pred_test_p2_m4,
                                   obs = y_test_data_p2_m4)
metrics_table_test_p2_m4 <- data.frame(
  Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
  Value = c(metrics_test_p2_m4["RMSE"]^2,
            metrics_test_p2_m4["RMSE"],
            metrics_test_p2_m4["MAE"],
            metrics_test_p2_m4["Rsquared"])
)

# Display in-sample performance metrics for Model 4
print("Performance Metrics for the Training Dataset:")
## [1] "Performance Metrics for the Training Dataset:"
metrics_table_train_p2_m4
##      Metric      Value
## 1       MSE 0.03139148
## 2      RMSE 0.17717642
## 3       MAE 0.12423672
## 4 R-squared 0.50715611
# Display out-of-sample performance metrics for Model 4
print("Performance Metrics for the Testing Dataset:")
## [1] "Performance Metrics for the Testing Dataset:"
metrics_table_test_p2_m4
##      Metric      Value
## 1       MSE 0.02831917
## 2      RMSE 0.16828299
## 3       MAE 0.12242571
## 4 R-squared 0.55829343
# Summary of the fitted model: coefficient estimates, significance, fit stats
summary(model_4_p2)
## 
## Call:
## lm(formula = returns ~ ., data = train_data_p2_m4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97116 -0.08898 -0.00791  0.08081  0.88828 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.630172   0.059686  10.558  < 2e-16 ***
## Mkt_beta           19.898647   1.114378  17.856  < 2e-16 ***
## log_dollar_vol      0.006210   0.001951   3.183  0.00149 ** 
## log_tracking_error  0.102500   0.012439   8.240 3.62e-16 ***
## log_turnover_ratio -0.033037   0.005078  -6.506 1.04e-10 ***
## log_volatility      0.067368   0.007573   8.896  < 2e-16 ***
## scaled_abs_corr_sp  0.078958   0.007518  10.503  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1776 on 1552 degrees of freedom
## Multiple R-squared:  0.5072, Adjusted R-squared:  0.5053 
## F-statistic: 266.2 on 6 and 1552 DF,  p-value: < 2.2e-16
# Check multicollinearity: all variance inflation factors should be < 5
print(vif(model_4_p2))
##           Mkt_beta     log_dollar_vol log_tracking_error log_turnover_ratio 
##           1.901622           1.272908           2.537112           1.398859 
##     log_volatility scaled_abs_corr_sp 
##           1.798556           2.896959
## Performance Metrics for the Training Dataset:
##      Metric      Value
## 1       MSE 0.03139148
## 2      RMSE 0.17717642
## 3       MAE 0.12423672
## 4 R-squared 0.50715611
## Performance Metrics for the Testing Dataset:
##      Metric      Value
## 1       MSE 0.02831917
## 2      RMSE 0.16828299
## 3       MAE 0.12242571
## 4 R-squared 0.55829343
## Regression coefficient of Model 1:
## 
## Call:
## lm(formula = returns ~ ., data = train_data_p2_m4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97116 -0.08898 -0.00791  0.08081  0.88828 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.630172   0.059686  10.558  < 2e-16 ***
## Mkt_beta           19.898647   1.114378  17.856  < 2e-16 ***
## log_dollar_vol      0.006210   0.001951   3.183  0.00149 ** 
## log_tracking_error  0.102500   0.012439   8.240 3.62e-16 ***
## log_turnover_ratio -0.033037   0.005078  -6.506 1.04e-10 ***
## log_volatility      0.067368   0.007573   8.896  < 2e-16 ***
## scaled_abs_corr_sp  0.078958   0.007518  10.503  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1776 on 1552 degrees of freedom
## Multiple R-squared:  0.5072, Adjusted R-squared:  0.5053 
## F-statistic: 266.2 on 6 and 1552 DF,  p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
##           Mkt_beta     log_dollar_vol log_tracking_error log_turnover_ratio 
##           1.901622           1.272908           2.537112           1.398859 
##     log_volatility scaled_abs_corr_sp 
##           1.798556           2.896959
# Scatterplot of predicted vs. actual returns on the test set
plot_data_p2_m4 <- data.frame(
  Predicted = pred_test_p2_m4,
  Actual = y_test_data_p2_m4
)

plot_m4_p2 <- ggplot(plot_data_p2_m4, aes(x = Predicted, y = Actual)) +
  geom_point() +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm", se = FALSE, color = "blue") +
  scale_linetype_manual(name = NULL, values = "solid",
                        labels = "Regression Line") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Evaluating Model 4's Performance: Predicted vs. Actual Returns in the COVID Period",
       x = "Predicted returns",
       y = "Actual returns")

print(plot_m4_p2)

Important variables

# Extract fitted coefficients from Model 4
coef_m4 <- coef(model_4_p2)

importance_m4 <- data.frame(
  Factors = names(coef_m4),
  Coefficient = coef_m4
)
# Drop the intercept row. The column is named "Factors"; the previous
# `$Factor` only worked via data.frame partial matching and would silently
# return NULL (dropping nothing) on a tibble.
importance_m4 <- importance_m4[importance_m4$Factors != "(Intercept)", ]

# Diverging bar chart of coefficients, colour-coded by sign of the impact
plot_m4_p2_factors <- ggplot(importance_m4,
                             aes(x = reorder(Factors, Coefficient),
                                 y = Coefficient,
                                 fill = Coefficient > 0)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(Coefficient, 3)),
            vjust = ifelse(importance_m4$Coefficient > 0, 0.5, 0.3),
            color = "black") +
  coord_flip() +
  labs(title = "Impact of factors on Returns for Model 4",
       x = "Factors",
       y = "Coefficient") +
  theme_minimal() +
  scale_fill_manual(
    values = c("TRUE" = "steelblue", "FALSE" = "coral"),
    labels = c("TRUE" = "Positive Impact", 
               "FALSE" = "Negative Impact"),
    name = ""
  ) +
  theme(
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

print(plot_m4_p2_factors)

Group by Sector

# Evaluate Model 4's test-set performance within each industry sector
pred_sector_p2_m4 <- cbind(X_test_data_p2_m4,
                           SECTOR = task2_reg_p2_df2$SECTOR[-splitIndex_p2_m4])

data_sector_p2_m4 <- data.frame(
  Predicted = pred_test_p2_m4,
  Actual = y_test_data_p2_m4,
  SECTOR = pred_sector_p2_m4$SECTOR
)

data_sector_p2_m4$SECTOR <- as.character(data_sector_p2_m4$SECTOR)

# Per-sector evaluation metrics. R-squared is undefined when a sector has no
# variance in Actual (e.g. a single test observation, total SS = 0); report
# NA there instead of the -Inf produced by dividing by zero.
metrics_table_sector_p2_m4 <- data_sector_p2_m4 %>% 
  group_by(SECTOR) %>% 
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    residuals = sum((Actual - Predicted)^2),
    total = sum((Actual - mean(Actual))^2),
    R_Squared = if_else(total > 0, 1 - (residuals / total), NA_real_)
  ) %>% 
  select(-c(residuals, total)) 

print(metrics_table_sector_p2_m4)
## # A tibble: 1 × 5
##   SECTOR     MSE  RMSE   MAE R_Squared
##   <chr>    <dbl> <dbl> <dbl>     <dbl>
## 1 Unknown 0.0283 0.168 0.122     0.555
# Scatter of predicted vs actual test returns, colored by industry sector,
# with a single pooled OLS trend line
plot_m4_p2_sector <- ggplot(data_sector_p2_m4,
                            aes(x = Predicted, y = Actual, color = SECTOR)) +
  geom_point(size = 2) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm",
              se = FALSE,
              color = "blue") +
  scale_linetype_manual(name = NULL,
                        values = "solid",
                        labels = "Regression Line") +
  labs(title = "Evaluating Model 4's Performance: Predicted vs. Actual Returns",
       subtitle = "During the COVID Period (Grouped by Industry Sector)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Industry Sector") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"))

print(plot_m4_p2_sector)

Group by Company size

# Re-attach company_size to the test rows so predictions can be grouped by size
pred_size_p2_m4 <- cbind(X_test_data_p2_m4,
                         company_size = task2_reg_p2_df2$company_size[-splitIndex_p2_m4])

data_size_p2_m4 <- data.frame(
  Predicted = pred_test_p2_m4,
  Actual = y_test_data_p2_m4,
  company_size = pred_size_p2_m4$company_size
)

# Per-size-bucket evaluation metrics for the test-set predictions
metrics_table_size_p2_m4 <- data_size_p2_m4 %>%
  group_by(company_size) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    # R^2 = 1 - SS_residual / SS_total, within each size bucket
    R_Squared = 1 - sum((Actual - Predicted)^2) /
      sum((Actual - mean(Actual))^2)
  )

print(metrics_table_size_p2_m4)
## # A tibble: 3 × 5
##   company_size    MSE  RMSE    MAE R_Squared
##   <fct>         <dbl> <dbl>  <dbl>     <dbl>
## 1 Small        0.0297 0.172 0.124      0.557
## 2 Medium       0.0245 0.157 0.127      0.389
## 3 Large        0.0145 0.120 0.0991     0.652
# Scatter of predicted vs actual test returns, colored by company size,
# with a single pooled OLS trend line
plot_m4_p2_size <- ggplot(data_size_p2_m4,
                          aes(x = Predicted, y = Actual, color = company_size)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm",
              se = FALSE,
              color = "blue") +
  scale_linetype_manual(name = NULL,
                        values = "solid",
                        labels = "Regression Line") +
  labs(title = "Evaluating Model 4's Performance: Predicted vs. Actual Returns",
       subtitle = "During the COVID Period (Grouped by Company Size)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Company Size") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"))

print(plot_m4_p2_size)

Model 5: (Stocks) Returns: 2019-12-14 to 2020-01-20 & X variables: before 2019-12-14

# Period 1 - Dataset 3
# Returns measured over 2019-12-14 to 2020-01-20; explanatory (X) variables
# aggregated over 2019-08-20 to 2019-12-14; universe restricted to stocks,
# using the non-COVID Fama-French factor betas.
# NOTE(review): argument meanings inferred from the two date windows and the
# function name -- confirm against transform_dataset_for_task2's definition.
task2_p1_df3_1 <- transform_dataset_for_task2(task2, "2019-12-14", "2020-01-20", "2019-08-20", "2019-12-14", "stocks", FFF_beta_noncovid) 
## Highest Date: [1] "2020-01-17"
## 
## Lowest Date: [1] "2019-12-16"
## 
## Number of PERMNO in the dataset that calculated returns:  5790 
## Number of PERMNO that are in the dataset that aggregate x variables:  3660 
## Number of PERMNO that are in the both datasets:  3598 
## Number of PERMNO that are in the final dataset:  3553
# Remove outlier for returns
task2_p1_df3_1 <- remove_outliers_all(task2_p1_df3_1, "returns")

# Inspect the distribution of Returns after outlier removal
hist_returns_p1_m5 <- ggplot(task2_p1_df3_1, aes(x = returns)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
print(hist_returns_p1_m5)

cat("Sample size after remove outlier: ", length(unique(task2_p1_df3_1$PERMNO)))
## Sample size after remove outlier:  3173
# Create dummy variable for categorical variable
# Factor levels are taken from the pre-filter data (task2_p1_df3_1), so every
# observed category keeps its level even if filtering drops all of its rows.
# model.matrix(~ x - 1) omits the intercept, yielding one 0/1 indicator column
# per level; bind_cols() appends those columns to the original data.
task2_reg_p1_df3 <- task2_p1_df3_1 %>%
  filter(!is.na(volatility)) %>% 
  mutate(company_size = factor(company_size, levels = unique(task2_p1_df3_1$company_size)),
         SECTOR = factor(SECTOR, levels = unique(task2_p1_df3_1$SECTOR))) %>%
  bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
  bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) 
## tibble [3,157 × 55] (S3: tbl_df/tbl/data.frame)
##  $ PERMNO                      : chr [1:3157] "10026" "10028" "10032" "10044" ...
##  $ returns                     : num [1:3157] 0.01925 0.1259 0.01087 -0.00613 -0.00417 ...
##  $ SHRCD                       : int [1:3157] 11 11 11 11 11 11 11 11 11 11 ...
##  $ TICKER                      : chr [1:3157] "JJSF" "DGSE" "PLXS" "RMCF" ...
##  $ SECTOR                      : Factor w/ 12 levels "Consumer_Staples",..: 1 2 3 1 2 3 3 4 5 5 ...
##  $ company_size                : Factor w/ 3 levels "Medium","Small",..: 1 2 2 2 2 3 3 3 3 2 ...
##  $ VOL                         : num [1:3157] 7586162 4650390 13170524 658814 32686198 ...
##  $ dollar_vol                  : num [1:3157] 1.44e+09 6.27e+06 9.09e+08 5.93e+06 7.24e+08 ...
##  $ bid_ask_spread              : num [1:3157] 0.00109 0.015923 0.000649 0.011499 0.000773 ...
##  $ turnover_ratio              : num [1:3157] 0.0049 0.00211 0.00551 0.00134 0.01069 ...
##  $ volatility                  : num [1:3157] 0.176 0.658 0.28 0.153 0.364 ...
##  $ sharpe_ratio                : num [1:3157] -143.4 -38.2 -91 -161.8 -68.9 ...
##  $ market_share                : num [1:3157] 0.0286 0 0.0252 0.2334 0.0562 ...
##  $ market_cap                  : num [1:3157] 2.94e+11 2.76e+09 1.62e+11 4.45e+09 6.75e+10 ...
##  $ abs_corr_sp                 : num [1:3157] 0.72 0.318 0.635 0.193 0.595 ...
##  $ security_type               : chr [1:3157] "Stock" "Stock" "Stock" "Stock" ...
##  $ tracking_error              : num [1:3157] 0.0225 0.0523 0.0261 0.0403 0.0324 ...
##  $ log_vol                     : num [1:3157] 15.8 15.4 16.4 13.4 17.3 ...
##  $ log_dollar_vol              : num [1:3157] 21.1 15.7 20.6 15.6 20.4 ...
##  $ log_spread                  : num [1:3157] -6.82 -4.14 -7.34 -4.47 -7.16 ...
##  $ log_turnover_ratio          : num [1:3157] -5.32 -6.16 -5.2 -6.61 -4.54 ...
##  $ log_volatility              : num [1:3157] -1.736 -0.418 -1.274 -1.878 -1.011 ...
##  $ log_market_share            : num [1:3157] -3.56 -13.82 -3.68 -1.46 -2.88 ...
##  $ log_market_cap              : num [1:3157] 26.4 21.7 25.8 22.2 24.9 ...
##  $ log_tracking_error          : num [1:3157] -3.79 -2.95 -3.65 -3.21 -3.43 ...
##  $ vol_cap_ratio               : num [1:3157] 2.58e-05 1.69e-03 8.15e-05 1.48e-04 4.84e-04 ...
##  $ log_vol_cap_ratio           : num [1:3157] -10.56 -6.38 -9.41 -8.82 -7.63 ...
##  $ scaled_vol                  : num [1:3157, 1] -0.325 -0.337 -0.302 -0.353 -0.221 ...
##   ..- attr(*, "scaled:center")= num 86487161
##   ..- attr(*, "scaled:scale")= num 2.43e+08
##  $ scaled_dollar_vol           : num [1:3157, 1] -0.162 -0.247 -0.194 -0.247 -0.205 ...
##   ..- attr(*, "scaled:center")= num 4.18e+09
##   ..- attr(*, "scaled:scale")= num 1.69e+10
##  $ scaled_spread               : num [1:3157, 1] -0.442 0.617 -0.473 0.301 -0.465 ...
##   ..- attr(*, "scaled:center")= num 0.00728
##   ..- attr(*, "scaled:scale")= num 0.014
##  $ scaled_turnover_ratio       : num [1:3157, 1] -0.0337 -0.0406 -0.0322 -0.0425 -0.0193 ...
##   ..- attr(*, "scaled:center")= num 0.0185
##   ..- attr(*, "scaled:scale")= num 0.403
##  $ scaled_volatility           : num [1:3157, 1] -0.717 0.437 -0.469 -0.773 -0.268 ...
##   ..- attr(*, "scaled:center")= num 0.476
##   ..- attr(*, "scaled:scale")= num 0.418
##  $ scaled_sharpe_ratio         : num [1:3157, 1] -0.4259 0.4523 0.0118 -0.5794 0.1957 ...
##   ..- attr(*, "scaled:center")= num -92.4
##   ..- attr(*, "scaled:scale")= num 120
##  $ scaled_market_share         : num [1:3157, 1] -0.174 -0.177 -0.174 -0.15 -0.171 ...
##   ..- attr(*, "scaled:center")= num 1.51
##   ..- attr(*, "scaled:scale")= num 8.49
##  $ scaled_market_cap           : num [1:3157, 1] -0.111 -0.199 -0.151 -0.198 -0.179 ...
##   ..- attr(*, "scaled:center")= num 6.64e+11
##   ..- attr(*, "scaled:scale")= num 3.32e+12
##  $ scaled_abs_corr_sp          : num [1:3157, 1] 1.047 -0.766 0.663 -1.333 0.481 ...
##   ..- attr(*, "scaled:center")= num 0.488
##   ..- attr(*, "scaled:scale")= num 0.221
##  $ scaled_tracking_error       : num [1:3157, 1] -0.649 0.181 -0.55 -0.153 -0.374 ...
##   ..- attr(*, "scaled:center")= num 0.0458
##   ..- attr(*, "scaled:scale")= num 0.0359
##  $ Mkt_beta                    : num [1:3157] 0.00362 0.01878 0.01514 0.00029 0.00657 ...
##  $ SMB_beta                    : num [1:3157] 0.001857 0.003485 0.005584 -0.000635 0.013586 ...
##  $ HML_beta                    : num [1:3157] -0.00244 0.00175 -0.00202 -0.002 0.00093 ...
##  $ company_sizeMedium          : num [1:3157] 1 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeSmall           : num [1:3157] 0 1 1 1 1 0 0 0 0 1 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeLarge           : num [1:3157] 0 0 0 0 0 1 1 1 1 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ SECTORConsumer_Staples      : num [1:3157] 1 0 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORUnknown               : num [1:3157] 0 1 0 0 1 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORInformation_Technology: num [1:3157] 0 0 1 0 0 1 1 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORFinancials            : num [1:3157] 0 0 0 0 0 0 0 1 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORIndustrials           : num [1:3157] 0 0 0 0 0 0 0 0 1 1 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORHealth_Care           : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORConsumer_Discretionary: num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORMaterials             : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORReal_Estate           : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORUtilities             : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTOREnergy                : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORCommunication_Services: num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 55
##    PERMNO  returns SHRCD TICKER SECTOR            company_size    VOL dollar_vol
##    <chr>     <dbl> <int> <chr>  <fct>             <fct>         <dbl>      <dbl>
##  1 10026   0.0192     11 JJSF   Consumer_Staples  Medium       7.59e6    1.44e 9
##  2 10028   0.126      11 DGSE   Unknown           Small        4.65e6    6.27e 6
##  3 10032   0.0109     11 PLXS   Information_Tech… Small        1.32e7    9.09e 8
##  4 10044  -0.00613    11 RMCF   Consumer_Staples  Small        6.59e5    5.93e 6
##  5 10051  -0.00417    11 HNGR   Unknown           Small        3.27e7    7.24e 8
##  6 10104   0.0211     11 ORCL   Information_Tech… Large        9.55e8    5.21e10
##  7 10107   0.0744     11 MSFT   Information_Tech… Large        1.84e9    2.61e11
##  8 10138   0.0732     11 TROW   Financials        Large        7.70e7    8.87e 9
##  9 10145   0.0387     11 HON    Industrials       Large        2.26e8    3.85e10
## 10 10158   0.205      11 AMRC   Industrials       Small        9.85e6    1.52e 8
## # ℹ 47 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## #   abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## #   log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## #   log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## #   log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## #   log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …

Correlation of variables

# Find the correlation between returns and each potential explanatory variable
# (non-numeric identifier/category columns are excluded first)
cor_p1_m5 <- task2_reg_p1_df3 %>%
  select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)) %>%
  cor(use = "pairwise.complete.obs")

# Correlation over absolute 0.7 = high risk of multicollinearity
high_corr_p1_m5 <- which(abs(cor_p1_m5) > 0.7, arr.ind = TRUE)
high_corr_pairs_p1_m5 <- data.frame(
  Feature1 = rownames(cor_p1_m5)[high_corr_p1_m5[, 1]],
  Feature2 = colnames(cor_p1_m5)[high_corr_p1_m5[, 2]],
  Correlation = cor_p1_m5[high_corr_p1_m5]
)
# Drop the diagonal (each variable's correlation with itself is exactly 1)
high_corr_pairs_p1_m5 <- high_corr_pairs_p1_m5 %>% filter(Correlation < 1)

# Inspect the variables with multicollinearity
head(high_corr_pairs_p1_m5)
##            Feature1       Feature2 Correlation
## 1        market_cap     dollar_vol   0.8922280
## 2 scaled_market_cap     dollar_vol   0.8922280
## 3        log_spread bid_ask_spread   0.7390434
## 4    log_volatility     volatility   0.7927991
## 5    log_volatility   sharpe_ratio   0.7391650
## 6        dollar_vol     market_cap   0.8922280
# Inspect all correlations (long format: Var1, Var2, value)
# NOTE(review): melt() is from reshape2/data.table, which is not among the
# libraries attached in the visible header -- verify it is loaded earlier.
head(melt(cor_p1_m5))
##             Var1    Var2        value
## 1        returns returns  1.000000000
## 2            VOL returns  0.009516856
## 3     dollar_vol returns  0.061734657
## 4 bid_ask_spread returns -0.023646635
## 5 turnover_ratio returns  0.041818227
## 6     volatility returns  0.064446949

Build Model 5

# Helper: build an MSE/RMSE/MAE/R-squared table from caret::postResample()
# output (postResample reports RMSE, so MSE = RMSE^2). Defined once to avoid
# duplicating the table construction for the train and test splits.
make_metrics_table_m5 <- function(metrics) {
  data.frame(
    Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
    Value = c(metrics["RMSE"]^2,
              metrics["RMSE"],
              metrics["MAE"],
              metrics["Rsquared"])
  )
}

# Splitting the dataset into training and testing sets
set.seed(847)  

# Non-COVID Period: explanatory variables chosen after the multicollinearity
# screen above
X_p1_m5 <- task2_reg_p1_df3 %>% 
  select(Mkt_beta, SMB_beta, HML_beta, log_turnover_ratio, log_volatility, vol_cap_ratio, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown)

y_p1_m5 <- task2_reg_p1_df3$returns

# 80/20 split; createDataPartition keeps the response distribution similar
# across the two subsets
splitIndex_p1_m5 <- createDataPartition(y_p1_m5, 
                                        p = 0.8, 
                                        list = FALSE)
X_train_data_p1_m5 <- X_p1_m5[splitIndex_p1_m5, ]
X_test_data_p1_m5 <- X_p1_m5[-splitIndex_p1_m5, ]
y_train_data_p1_m5 <- y_p1_m5[splitIndex_p1_m5]
y_test_data_p1_m5 <- y_p1_m5[-splitIndex_p1_m5]

train_data_p1_m5 <- cbind(X_train_data_p1_m5, returns = y_train_data_p1_m5)

# Build Model: OLS regression of returns on all selected predictors
model_5_p1 <- lm(returns ~ ., data = train_data_p1_m5)

# Prediction for training sample
pred_train_p1_m5 <- predict(model_5_p1, newdata = X_train_data_p1_m5)
metrics_train_p1_m5 <- postResample(pred = pred_train_p1_m5, 
                                    obs = train_data_p1_m5$returns)
metrics_table_train_p1_m5 <- make_metrics_table_m5(metrics_train_p1_m5)

# Prediction for testing sample
pred_test_p1_m5 <- predict(model_5_p1, newdata = X_test_data_p1_m5)
metrics_test_p1_m5 <- postResample(pred = pred_test_p1_m5, 
                                   obs = y_test_data_p1_m5)
metrics_table_test_p1_m5 <- make_metrics_table_m5(metrics_test_p1_m5)
## Performance Metrics for the Training Dataset:
##      Metric       Value
## 1       MSE 0.005882033
## 2      RMSE 0.076694414
## 3       MAE 0.057478969
## 4 R-squared 0.089301378
## Performance Metrics for the Testing Dataset:
##      Metric       Value
## 1       MSE 0.006180838
## 2      RMSE 0.078618304
## 3       MAE 0.057724385
## 4 R-squared 0.054573909
## Regression coefficient of Model 1:
## 
## Call:
## lm(formula = returns ~ ., data = train_data_p1_m5)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23497 -0.04214 -0.00136  0.04151  0.25555 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.0829358  0.0124833   6.644 3.74e-11 ***
## Mkt_beta                     -0.3487390  0.2313241  -1.508  0.13179    
## SMB_beta                     -0.7431799  0.1642898  -4.524 6.36e-06 ***
## HML_beta                     -0.9928276  0.1839652  -5.397 7.42e-08 ***
## log_turnover_ratio            0.0072711  0.0015790   4.605 4.33e-06 ***
## log_volatility                0.0114468  0.0029190   3.921 9.04e-05 ***
## vol_cap_ratio                 0.0900588  0.0688961   1.307  0.19128    
## SECTORMaterials              -0.0231078  0.0118721  -1.946  0.05172 .  
## SECTORIndustrials             0.0084369  0.0090790   0.929  0.35284    
## SECTORFinancials             -0.0165188  0.0088529  -1.866  0.06217 .  
## SECTORInformation_Technology  0.0295801  0.0092036   3.214  0.00133 ** 
## SECTORConsumer_Discretionary -0.0009514  0.0092070  -0.103  0.91771    
## SECTORHealth_Care             0.0168308  0.0090503   1.860  0.06305 .  
## SECTOREnergy                 -0.0083266  0.0119167  -0.699  0.48478    
## SECTORUtilities               0.0056029  0.0131535   0.426  0.67017    
## SECTORReal_Estate            -0.0059908  0.0189206  -0.317  0.75155    
## SECTORCommunication_Services  0.0338927  0.0117035   2.896  0.00381 ** 
## SECTORUnknown                 0.0061192  0.0084326   0.726  0.46812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07697 on 2510 degrees of freedom
## Multiple R-squared:  0.0893, Adjusted R-squared:  0.08313 
## F-statistic: 14.48 on 17 and 2510 DF,  p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
##                     Mkt_beta                     SMB_beta 
##                     1.168363                     1.319472 
##                     HML_beta           log_turnover_ratio 
##                     1.213377                     1.340978 
##               log_volatility                vol_cap_ratio 
##                     1.454083                     1.147608 
##              SECTORMaterials            SECTORIndustrials 
##                     1.731422                     3.529172 
##             SECTORFinancials SECTORInformation_Technology 
##                     4.169097                     3.403189 
## SECTORConsumer_Discretionary            SECTORHealth_Care 
##                     3.360413                     4.017917 
##                 SECTOREnergy              SECTORUtilities 
##                     1.901831                     1.543359 
##            SECTORReal_Estate SECTORCommunication_Services 
##                     1.199001                     1.747773 
##                SECTORUnknown 
##                     5.511434
## Despite SectorUnknown has a VIF over 5, considering it is actually the group of data with missing value in Sector, while some other industry categories are important to the model. After trial and error, it is better to keep the industry variable in the model.
# Scatterplot of Prediction vs Actual for the model-5 test set
plot_data_p1_m5 <- data.frame(
  Predicted = pred_test_p1_m5,
  Actual = y_test_data_p1_m5
)

plot_m5_p1 <- ggplot(plot_data_p1_m5, aes(x = Predicted, y = Actual)) +
  geom_point() +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm",
              se = FALSE,
              color = "blue") +
  scale_linetype_manual(name = NULL,
                        values = "solid",
                        labels = "Regression Line") +
  labs(title = "Evaluating Model 5's Performance: Predicted vs. Actual Returns in the Non-COVID Period",
       x = "Predicted returns",
       y = "Actual returns") +
  theme_minimal() +
  theme(legend.position = "bottom")

print(plot_m5_p1)

Important variables

# Extract coefficients
coef_m5 <- coef(model_5_p1)

# One row per regression term; the intercept is not an explanatory factor,
# so drop it before plotting.
importance_m5 <- data.frame(
  Factors = names(coef_m5),
  Coefficient = coef_m5
)
# Bug fix: the column is named "Factors" -- the original `$Factor` only worked
# via R's partial matching on data.frames and silently breaks on tibbles.
importance_m5 <- importance_m5[importance_m5$Factors != "(Intercept)", ]

# Plot with positive and negative impacts
plot_m5_p1_factors <- ggplot(importance_m5, aes(x = reorder(Factors, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
  geom_col() +  # geom_col() is the idiomatic geom_bar(stat = "identity")
  geom_text(aes(label = round(Coefficient, 3)),
            vjust = ifelse(importance_m5$Coefficient > 0, 0.5, 0.3),
            color = "black") +
  coord_flip() +
  labs(title = "Impact of factors on Returns for Model 5",
       x = "Factors",
       y = "Coefficient") +
  theme_minimal() +
  scale_fill_manual(
    values = c("TRUE" = "steelblue", "FALSE" = "coral"),
    labels = c("TRUE" = "Positive Impact", 
               "FALSE" = "Negative Impact"),
    name = ""
  ) +
  theme(
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

print(plot_m5_p1_factors)

Group by Sector

# Re-attach SECTOR to the test rows so predictions can be grouped by industry
pred_sector_p1_m5 <- cbind(X_test_data_p1_m5,
                           SECTOR = task2_reg_p1_df3$SECTOR[-splitIndex_p1_m5])

data_sector_p1_m5 <- data.frame(
  Predicted = pred_test_p1_m5,
  Actual = y_test_data_p1_m5,
  SECTOR = pred_sector_p1_m5$SECTOR
)

data_sector_p1_m5$SECTOR <- as.character(data_sector_p1_m5$SECTOR)

# Per-sector evaluation metrics for the test-set predictions
metrics_table_sector_p1_m5 <- data_sector_p1_m5 %>%
  group_by(SECTOR) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    # R^2 = 1 - SS_residual / SS_total, within each sector
    R_Squared = 1 - sum((Actual - Predicted)^2) /
      sum((Actual - mean(Actual))^2)
  )

print(metrics_table_sector_p1_m5)
## # A tibble: 12 × 5
##    SECTOR                     MSE   RMSE    MAE R_Squared
##    <chr>                    <dbl>  <dbl>  <dbl>     <dbl>
##  1 Communication_Services 0.00307 0.0554 0.0491   0.0476 
##  2 Consumer_Discretionary 0.00543 0.0737 0.0535   0.0374 
##  3 Consumer_Staples       0.00458 0.0677 0.0517  -0.0289 
##  4 Energy                 0.0139  0.118  0.0995  -0.0464 
##  5 Financials             0.00155 0.0394 0.0309   0.142  
##  6 Health_Care            0.0103  0.101  0.0792   0.00111
##  7 Industrials            0.00630 0.0794 0.0569   0.0526 
##  8 Information_Technology 0.00820 0.0906 0.0717  -0.00333
##  9 Materials              0.00700 0.0837 0.0632  -0.360  
## 10 Real_Estate            0.00679 0.0824 0.0638   0.111  
## 11 Unknown                0.00653 0.0808 0.0591   0.00328
## 12 Utilities              0.00198 0.0445 0.0303  -0.0203
# Scatter of predicted vs actual test returns, colored by industry sector,
# with a single pooled OLS trend line
plot_m5_p1_sector <- ggplot(data_sector_p1_m5,
                            aes(x = Predicted, y = Actual, color = SECTOR)) +
  geom_point(size = 2) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm",
              se = FALSE,
              color = "blue") +
  scale_linetype_manual(name = NULL,
                        values = "solid",
                        labels = "Regression Line") +
  labs(title = "Evaluating Model 5's Performance: Predicted vs. Actual Returns",
       subtitle = "During the Non-COVID Period (Grouped by Industry Sector)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Industry Sector") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"))

print(plot_m5_p1_sector)

Group by Company size

# Re-attach company_size to the test rows so predictions can be grouped by size
pred_size_p1_m5 <- cbind(X_test_data_p1_m5,
                         company_size = task2_reg_p1_df3$company_size[-splitIndex_p1_m5])

data_size_p1_m5 <- data.frame(
  Predicted = pred_test_p1_m5,
  Actual = y_test_data_p1_m5,
  company_size = pred_size_p1_m5$company_size
)

# Per-size-bucket evaluation metrics for the test-set predictions
metrics_table_size_p1_m5 <- data_size_p1_m5 %>%
  group_by(company_size) %>%
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    # R^2 = 1 - SS_residual / SS_total, within each size bucket
    R_Squared = 1 - sum((Actual - Predicted)^2) /
      sum((Actual - mean(Actual))^2)
  )

print(metrics_table_size_p1_m5)
## # A tibble: 3 × 5
##   company_size     MSE   RMSE    MAE R_Squared
##   <fct>          <dbl>  <dbl>  <dbl>     <dbl>
## 1 Medium       0.00481 0.0694 0.0499    0.0325
## 2 Small        0.00745 0.0863 0.0648    0.0140
## 3 Large        0.00236 0.0486 0.0368    0.131
# Scatter of predicted vs actual test returns, colored by company size,
# with a single pooled OLS trend line
plot_m5_p1_size <- ggplot(data_size_p1_m5,
                          aes(x = Predicted, y = Actual, color = company_size)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(aes(linetype = "Regression Line"),
              method = "lm",
              se = FALSE,
              color = "blue") +
  scale_linetype_manual(name = NULL,
                        values = "solid",
                        labels = "Regression Line") +
  labs(title = "Evaluating Model 5's Performance: Predicted vs. Actual Returns",
       subtitle = "During the Non-COVID Period (Grouped by Company Size)",
       x = "Predicted returns",
       y = "Actual returns",
       color = "Company Size") +
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"))

print(plot_m5_p1_size)

Model 6: (Stocks) Returns: 2020-02-14 to 2020-03-20 & X variables: before 2020-02-14

# Period 2 - Dataset 3
# Returns measured over 2020-02-14 to 2020-03-20; explanatory (X) variables
# aggregated over 2019-08-20 to 2020-02-14; universe restricted to stocks,
# using the COVID-period Fama-French factor betas.
# NOTE(review): argument meanings inferred from the two date windows and the
# function name -- confirm against transform_dataset_for_task2's definition.
task2_p2_df3_1 <- transform_dataset_for_task2(task2, "2020-02-14", "2020-03-20", "2019-08-20", "2020-02-14", "stocks", FFF_beta_covid)
## Highest Date: [1] "2020-02-14"
## 
## Lowest Date: [1] "2020-03-18"
## 
## Number of PERMNO in the dataset that calculated returns:  5764 
## Number of PERMNO that are in the dataset that aggregate x variables:  3701 
## Number of PERMNO that are in the both datasets:  3595 
## Number of PERMNO that are in the final dataset:  3581
# Remove outlier for returns
task2_p2_df3_1 <- remove_outliers_all(task2_p2_df3_1, "returns")

# Inspect the distribution of Returns after outlier removal
hist_returns_p2_m6 <- ggplot(task2_p2_df3_1, aes(x = returns)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
print(hist_returns_p2_m6)

cat("Sample size after remove outlier: ", length(unique(task2_p2_df3_1$PERMNO)))
## Sample size after remove outlier:  3363
# Create dummy variable for categorical variable
# Factor levels are taken from the pre-filter data (task2_p2_df3_1), so every
# observed category keeps its level even if filtering drops all of its rows.
# model.matrix(~ x - 1) omits the intercept, yielding one 0/1 indicator column
# per level; bind_cols() appends those columns to the original data.
task2_reg_p2_df3 <- task2_p2_df3_1 %>%
  filter(!is.na(volatility)) %>% 
  mutate(company_size = factor(company_size, levels = unique(task2_p2_df3_1$company_size)),
         SECTOR = factor(SECTOR, levels = unique(task2_p2_df3_1$SECTOR))) %>%
  bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
  bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) 

# Inspect any missing value in important variables
colSums(is.na(task2_reg_p2_df3))
##                       PERMNO                      returns 
##                            0                            0 
##                        SHRCD                       TICKER 
##                            0                            0 
##                       SECTOR                 company_size 
##                            0                            0 
##                          VOL                   dollar_vol 
##                            0                            0 
##               bid_ask_spread               turnover_ratio 
##                            0                            0 
##                   volatility                 sharpe_ratio 
##                            0                            0 
##                 market_share                   market_cap 
##                            0                            0 
##                  abs_corr_sp                security_type 
##                            0                            0 
##               tracking_error                      log_vol 
##                            0                            0 
##               log_dollar_vol                   log_spread 
##                            0                            0 
##           log_turnover_ratio               log_volatility 
##                            0                            0 
##             log_market_share               log_market_cap 
##                            0                            0 
##           log_tracking_error                vol_cap_ratio 
##                            0                            0 
##            log_vol_cap_ratio                              
##                            0                            0 
##                                                           
##                            0                            0 
##                                                           
##                            0                            0 
##                                                           
##                            0                            0 
##                                                           
##                            0                            0 
##                                                  Mkt_beta 
##                            0                            0 
##                     SMB_beta                     HML_beta 
##                            0                            0 
##           company_sizeMedium            company_sizeSmall 
##                            0                            0 
##            company_sizeLarge       SECTORConsumer_Staples 
##                            0                            0 
##                SECTORUnknown SECTORInformation_Technology 
##                            0                            0 
##             SECTORFinancials            SECTORIndustrials 
##                            0                            0 
##            SECTORHealth_Care SECTORConsumer_Discretionary 
##                            0                            0 
##              SECTORMaterials            SECTORReal_Estate 
##                            0                            0 
##              SECTORUtilities                 SECTOREnergy 
##                            0                            0 
## SECTORCommunication_Services 
##                            0
# Inspect the final COVID-period regression dataset (first 10 rows)
print(task2_reg_p2_df3, n = 10)
## # A tibble: 3,335 × 55
##    PERMNO returns SHRCD TICKER SECTOR             company_size    VOL dollar_vol
##    <chr>    <dbl> <int> <chr>  <fct>              <fct>         <dbl>      <dbl>
##  1 10026   0.504     11 JJSF   Consumer_Staples   Medium       1.19e7    2.20e 9
##  2 10028   0.0106    11 DGSE   Unknown            Small        9.71e6    1.62e 7
##  3 10032   0.965     11 PLXS   Information_Techn… Small        2.09e7    1.50e 9
##  4 10044   0.551     11 RMCF   Consumer_Staples   Small        1.41e6    1.25e 7
##  5 10051   1.06      11 HNGR   Unknown            Small        4.25e7    9.85e 8
##  6 10104   0.173     11 ORCL   Information_Techn… Large        1.37e9    7.47e10
##  7 10107   0.320     11 MSFT   Information_Techn… Large        3.00e9    4.57e11
##  8 10138   0.267     11 TROW   Financials         Large        1.21e8    1.46e10
##  9 10145   0.512     11 HON    Industrials        Large        3.35e8    5.77e10
## 10 10158   0.259     11 AMRC   Industrials        Small        1.69e7    2.88e 8
## # ℹ 3,325 more rows
## # ℹ 47 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## #   abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## #   log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## #   log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## #   log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>, …
## tibble [3,335 × 55] (S3: tbl_df/tbl/data.frame)
##  $ PERMNO                      : chr [1:3335] "10026" "10028" "10032" "10044" ...
##  $ returns                     : num [1:3335] 0.5036 0.0106 0.9654 0.5507 1.0622 ...
##  $ SHRCD                       : int [1:3335] 11 11 11 11 11 11 11 11 11 11 ...
##  $ TICKER                      : chr [1:3335] "JJSF" "DGSE" "PLXS" "RMCF" ...
##  $ SECTOR                      : Factor w/ 12 levels "Consumer_Staples",..: 1 2 3 1 2 3 3 4 5 5 ...
##  $ company_size                : Factor w/ 3 levels "Medium","Small",..: 1 2 2 2 2 3 3 3 3 2 ...
##  $ VOL                         : num [1:3335] 11872557 9713299 20878422 1408090 42541489 ...
##  $ dollar_vol                  : num [1:3335] 2.20e+09 1.62e+07 1.50e+09 1.25e+07 9.85e+08 ...
##  $ bid_ask_spread              : num [1:3335] 0.001024 0.014651 0.000675 0.011492 0.000678 ...
##  $ turnover_ratio              : num [1:3335] 0.00507 0.00291 0.00577 0.00189 0.0092 ...
##  $ volatility                  : num [1:3335] 0.195 0.618 0.238 0.15 0.296 ...
##  $ sharpe_ratio                : num [1:3335] -137.3 -41.7 -109.6 -162.8 -88.8 ...
##  $ market_share                : num [1:3335] 0.0422 0.024 0.0359 0.3338 0.0776 ...
##  $ market_cap                  : num [1:3335] 4.36e+11 4.72e+09 2.55e+11 6.66e+09 1.09e+11 ...
##  $ abs_corr_sp                 : num [1:3335] 0.72 0.318 0.635 0.193 0.595 ...
##  $ security_type               : chr [1:3335] "Stock" "Stock" "Stock" "Stock" ...
##  $ tracking_error              : num [1:3335] 0.0225 0.0523 0.0261 0.0403 0.0324 ...
##  $ log_vol                     : num [1:3335] 16.3 16.1 16.9 14.2 17.6 ...
##  $ log_dollar_vol              : num [1:3335] 21.5 16.6 21.1 16.3 20.7 ...
##  $ log_spread                  : num [1:3335] -6.88 -4.22 -7.3 -4.47 -7.29 ...
##  $ log_turnover_ratio          : num [1:3335] -5.28 -5.84 -5.15 -6.27 -4.69 ...
##  $ log_volatility              : num [1:3335] -1.637 -0.482 -1.434 -1.896 -1.218 ...
##  $ log_market_share            : num [1:3335] -3.16 -3.73 -3.33 -1.1 -2.56 ...
##  $ log_market_cap              : num [1:3335] 26.8 22.3 26.3 22.6 25.4 ...
##  $ log_tracking_error          : num [1:3335] -3.79 -2.95 -3.65 -3.21 -3.43 ...
##  $ vol_cap_ratio               : num [1:3335] 2.72e-05 2.06e-03 8.18e-05 2.11e-04 3.91e-04 ...
##  $ log_vol_cap_ratio           : num [1:3335] -10.51 -6.19 -9.41 -8.46 -7.85 ...
##  $ scaled_vol                  : num [1:3335, 1] -0.323 -0.329 -0.299 -0.351 -0.242 ...
##   ..- attr(*, "scaled:center")= num 1.34e+08
##   ..- attr(*, "scaled:scale")= num 3.77e+08
##  $ scaled_dollar_vol           : num [1:3335, 1] -0.147 -0.22 -0.171 -0.22 -0.188 ...
##   ..- attr(*, "scaled:center")= num 6.65e+09
##   ..- attr(*, "scaled:scale")= num 3.02e+10
##  $ scaled_spread               : num [1:3335, 1] -0.447 0.577 -0.473 0.339 -0.473 ...
##   ..- attr(*, "scaled:center")= num 0.00698
##   ..- attr(*, "scaled:scale")= num 0.0133
##  $ scaled_turnover_ratio       : num [1:3335, 1] -0.0442 -0.0523 -0.0415 -0.0561 -0.0287 ...
##   ..- attr(*, "scaled:center")= num 0.0168
##   ..- attr(*, "scaled:scale")= num 0.266
##  $ scaled_volatility           : num [1:3335, 1] -0.692 0.369 -0.582 -0.803 -0.438 ...
##   ..- attr(*, "scaled:center")= num 0.471
##   ..- attr(*, "scaled:scale")= num 0.399
##  $ scaled_sharpe_ratio         : num [1:3335, 1] -0.3362 0.4691 -0.1031 -0.5506 0.0721 ...
##   ..- attr(*, "scaled:center")= num -97.4
##   ..- attr(*, "scaled:scale")= num 119
##  $ scaled_market_share         : num [1:3335, 1] -0.174 -0.176 -0.175 -0.15 -0.171 ...
##   ..- attr(*, "scaled:center")= num 2.12
##   ..- attr(*, "scaled:scale")= num 12
##  $ scaled_market_cap           : num [1:3335, 1] -0.112 -0.194 -0.146 -0.194 -0.175 ...
##   ..- attr(*, "scaled:center")= num 1.02e+12
##   ..- attr(*, "scaled:scale")= num 5.22e+12
##  $ scaled_abs_corr_sp          : num [1:3335, 1] 1.054 -0.755 0.671 -1.32 0.49 ...
##   ..- attr(*, "scaled:center")= num 0.486
##   ..- attr(*, "scaled:scale")= num 0.222
##  $ scaled_tracking_error       : num [1:3335, 1] -0.653 0.178 -0.554 -0.157 -0.378 ...
##   ..- attr(*, "scaled:center")= num 0.0459
##   ..- attr(*, "scaled:scale")= num 0.0359
##  $ Mkt_beta                    : num [1:3335] 0.0017 0.014261 0.014132 0.000384 0.003971 ...
##  $ SMB_beta                    : num [1:3335] 0.00197 -0.00226 0.00549 0.00192 0.01299 ...
##  $ HML_beta                    : num [1:3335] -0.000521 0.000203 0.000435 -0.002509 0.002974 ...
##  $ company_sizeMedium          : num [1:3335] 1 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeSmall           : num [1:3335] 0 1 1 1 1 0 0 0 0 1 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ company_sizeLarge           : num [1:3335] 0 0 0 0 0 1 1 1 1 0 ...
##   ..- attr(*, "assign")= int [1:3] 1 1 1
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ company_size: chr "contr.treatment"
##  $ SECTORConsumer_Staples      : num [1:3335] 1 0 0 1 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORUnknown               : num [1:3335] 0 1 0 0 1 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORInformation_Technology: num [1:3335] 0 0 1 0 0 1 1 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORFinancials            : num [1:3335] 0 0 0 0 0 0 0 1 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORIndustrials           : num [1:3335] 0 0 0 0 0 0 0 0 1 1 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORHealth_Care           : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORConsumer_Discretionary: num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORMaterials             : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORReal_Estate           : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORUtilities             : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTOREnergy                : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
##  $ SECTORCommunication_Services: num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "contrasts")=List of 1
##   .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 55
##    PERMNO returns SHRCD TICKER SECTOR             company_size    VOL dollar_vol
##    <chr>    <dbl> <int> <chr>  <fct>              <fct>         <dbl>      <dbl>
##  1 10026   0.504     11 JJSF   Consumer_Staples   Medium       1.19e7    2.20e 9
##  2 10028   0.0106    11 DGSE   Unknown            Small        9.71e6    1.62e 7
##  3 10032   0.965     11 PLXS   Information_Techn… Small        2.09e7    1.50e 9
##  4 10044   0.551     11 RMCF   Consumer_Staples   Small        1.41e6    1.25e 7
##  5 10051   1.06      11 HNGR   Unknown            Small        4.25e7    9.85e 8
##  6 10104   0.173     11 ORCL   Information_Techn… Large        1.37e9    7.47e10
##  7 10107   0.320     11 MSFT   Information_Techn… Large        3.00e9    4.57e11
##  8 10138   0.267     11 TROW   Financials         Large        1.21e8    1.46e10
##  9 10145   0.512     11 HON    Industrials        Large        3.35e8    5.77e10
## 10 10158   0.259     11 AMRC   Industrials        Small        1.69e7    2.88e 8
## # ℹ 47 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## #   abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## #   log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## #   log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## #   log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## #   log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …

Correlation of variables

# Find the correlation between returns and each potential explanatory variable
cor_p2_m6 <- cor(task2_reg_p2_df2 %>% select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)), use = "pairwise.complete.obs")

# Correlation over absolute 0.7 = high risk of multicollinearity.
# Restrict to the upper triangle so the diagonal (self-correlations) is excluded
# and each pair is reported only once. The previous `filter(Correlation < 1)`
# approach listed every pair twice (the matrix is symmetric) and silently
# dropped genuinely perfectly-correlated distinct variables (e.g. a column and
# its rescaled copy), while keeping correlations of exactly -1.
high_corr_p2_m6 <- which(abs(cor_p2_m6) > 0.7 & upper.tri(cor_p2_m6), arr.ind = TRUE)
high_corr_pairs_p2_m6 <- data.frame(
  Feature1 = rownames(cor_p2_m6)[high_corr_p2_m6[, 1]],
  Feature2 = colnames(cor_p2_m6)[high_corr_p2_m6[, 2]],
  Correlation = cor_p2_m6[high_corr_p2_m6]
)

# Inspect the variables with multicollinearity
head(high_corr_pairs_p2_m6)
##            Feature1       Feature2 Correlation
## 1        market_cap     dollar_vol   0.7210814
## 2 scaled_market_cap     dollar_vol   0.7210814
## 3        log_spread bid_ask_spread   0.7265476
## 4     vol_cap_ratio turnover_ratio   0.7700479
## 5    log_volatility     volatility   0.8074679
## 6    log_volatility   sharpe_ratio   0.7121814
# Inspect all correlations in long (Var1, Var2, value) format.
# NOTE(review): melt() here is reshape2::melt — reshape2 is not loaded in the
# visible library() calls at the top of the file; confirm it is attached earlier.
head(melt(cor_p2_m6))
##             Var1    Var2        value
## 1        returns returns  1.000000000
## 2            VOL returns -0.003687189
## 3     dollar_vol returns -0.005025484
## 4 bid_ask_spread returns  0.025407662
## 5 turnover_ratio returns -0.019845336
## 6     volatility returns  0.418324811

Build Model 6

# Splitting the dataset into training and testing sets.
# set.seed() must stay immediately before createDataPartition() so the
# partition is reproducible (createDataPartition consumes the RNG stream).
set.seed(847)  

# COVID Period: predictors chosen after the multicollinearity screen above
X_p2_m6 <- task2_reg_p2_df3 %>% 
  select(Mkt_beta, SMB_beta, HML_beta, log_volatility, log_vol_cap_ratio, scaled_abs_corr_sp, company_sizeSmall, company_sizeLarge, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown)

y_p2_m6 <- task2_reg_p2_df3$returns

# 80/20 stratified split on the response
splitIndex_p2_m6 <- createDataPartition(y_p2_m6, 
                                        p = 0.8, 
                                        list = FALSE)
X_train_data_p2_m6 <- X_p2_m6[splitIndex_p2_m6, ]
X_test_data_p2_m6 <- X_p2_m6[-splitIndex_p2_m6, ]
y_train_data_p2_m6 <- y_p2_m6[splitIndex_p2_m6]
y_test_data_p2_m6 <- y_p2_m6[-splitIndex_p2_m6]

train_data_p2_m6 <- cbind(X_train_data_p2_m6, returns = y_train_data_p2_m6)

# Build Model: OLS regression of COVID-period returns on the selected factors
model_6_p2 <- lm(returns ~ ., data = train_data_p2_m6)

# Helper: convert caret::postResample() output (RMSE, Rsquared, MAE) into the
# report's metrics table. Extracted to avoid duplicating the table construction
# for the training and testing samples.
build_metrics_table_p2_m6 <- function(m) {
  data.frame(
    Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
    Value = c(m["RMSE"]^2,
              m["RMSE"], 
              m["MAE"], 
              m["Rsquared"])
  )
}

# Prediction for training sample
pred_train_p2_m6 <- predict(model_6_p2, newdata = X_train_data_p2_m6)

metrics_train_p2_m6 <- postResample(pred = pred_train_p2_m6, 
                                    obs = train_data_p2_m6$returns)
metrics_table_train_p2_m6 <- build_metrics_table_p2_m6(metrics_train_p2_m6)

# Prediction for testing sample
pred_test_p2_m6 <- predict(model_6_p2, newdata = X_test_data_p2_m6)
metrics_test_p2_m6 <- postResample(pred = pred_test_p2_m6, 
                                   obs = y_test_data_p2_m6)
metrics_table_test_p2_m6 <- build_metrics_table_p2_m6(metrics_test_p2_m6)
## Performance Metrics for the Training Dataset:
##      Metric     Value
## 1       MSE 0.2097130
## 2      RMSE 0.4579443
## 3       MAE 0.3544116
## 4 R-squared 0.1968778
## Performance Metrics for the Testing Dataset:
##      Metric     Value
## 1       MSE 0.2096475
## 2      RMSE 0.4578728
## 3       MAE 0.3562869
## 4 R-squared 0.1387898
## Regression coefficient of Model 1:
## 
## Call:
## lm(formula = returns ~ ., data = train_data_p2_m6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7174 -0.3063 -0.0697  0.2466  1.6724 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.954031   0.073043  13.061  < 2e-16 ***
## Mkt_beta                      4.270158   1.473787   2.897 0.003794 ** 
## SMB_beta                      2.135912   0.907829   2.353 0.018707 *  
## HML_beta                      2.696746   1.134483   2.377 0.017521 *  
## log_volatility                0.180457   0.020176   8.944  < 2e-16 ***
## log_vol_cap_ratio             0.037193   0.006628   5.611 2.22e-08 ***
## scaled_abs_corr_sp            0.189675   0.014024  13.525  < 2e-16 ***
## company_sizeSmall             0.087009   0.026089   3.335 0.000865 ***
## company_sizeLarge            -0.112466   0.033023  -3.406 0.000670 ***
## SECTORMaterials               0.143236   0.066822   2.144 0.032159 *  
## SECTORIndustrials             0.116480   0.052533   2.217 0.026689 *  
## SECTORFinancials              0.140620   0.051212   2.746 0.006076 ** 
## SECTORInformation_Technology  0.083333   0.052969   1.573 0.115786    
## SECTORConsumer_Discretionary  0.357843   0.054229   6.599 4.99e-11 ***
## SECTORHealth_Care             0.018774   0.050840   0.369 0.711956    
## SECTOREnergy                  0.354840   0.075084   4.726 2.41e-06 ***
## SECTORUtilities              -0.144939   0.082070  -1.766 0.077506 .  
## SECTORReal_Estate             0.284643   0.104170   2.733 0.006327 ** 
## SECTORCommunication_Services  0.237609   0.068935   3.447 0.000576 ***
## SECTORUnknown                 0.194282   0.048248   4.027 5.81e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4597 on 2651 degrees of freedom
## Multiple R-squared:  0.1969, Adjusted R-squared:  0.1911 
## F-statistic:  34.2 on 19 and 2651 DF,  p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
##                     Mkt_beta                     SMB_beta 
##                     1.296221                     1.233592 
##                     HML_beta               log_volatility 
##                     1.245251                     2.438936 
##            log_vol_cap_ratio           scaled_abs_corr_sp 
##                     2.186028                     2.473524 
##            company_sizeSmall            company_sizeLarge 
##                     1.901892                     1.500542 
##              SECTORMaterials            SECTORIndustrials 
##                     1.758857                     3.284024 
##             SECTORFinancials SECTORInformation_Technology 
##                     4.045598                     3.286238 
## SECTORConsumer_Discretionary            SECTORHealth_Care 
##                     2.890833                     4.355018 
##                 SECTOREnergy              SECTORUtilities 
##                     1.641274                     1.441101 
##            SECTORReal_Estate SECTORCommunication_Services 
##                     1.221472                     1.660585 
##                SECTORUnknown 
##                     5.584303
## Although SECTORUnknown has a VIF slightly over 5, it actually represents the group of observations with a missing Sector value, and several other industry categories are important to the model. After trial and error, it is better to keep the industry variables in the model.
# Scatterplot of Prediction vs Actual on the hold-out sample
plot_data_p2_m6 <- data.frame(
  Predicted = pred_test_p2_m6,
  Actual = y_test_data_p2_m6
)

# Scatter layer plus a single fitted line, surfaced in the legend via the
# linetype aesthetic
plot_m6_p2 <- ggplot(plot_data_p2_m6, aes(x = Predicted, y = Actual)) +
  geom_point() +
  geom_smooth(
    aes(linetype = "Regression Line"),
    method = "lm",
    se = FALSE,
    color = "blue"
  ) +
  scale_linetype_manual(
    name = NULL,
    values = "solid",
    labels = "Regression Line"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Evaluating Model 6's Performance: Predicted vs. Actual Returns in the COVID Period",
    x = "Predicted returns",
    y = "Actual returns"
  )

print(plot_m6_p2)

Important variables

# Extract coefficients from the fitted Model 6
coef_m6 <- coef(model_6_p2)

importance_m6 <- data.frame(
  Factors = names(coef_m6),
  Coefficient = coef_m6
)
# Drop the intercept row. NOTE: the previous code subset on `$Factor`, which
# only worked through data.frame partial name matching against `Factors` —
# a silent-failure footgun (it returns NULL on tibbles).
importance_m6 <- importance_m6[importance_m6$Factors != "(Intercept)", ]

# Horizontal bar chart of coefficients, colored by sign of the impact
plot_m6_p2_factors <- ggplot(importance_m6, aes(x = reorder(Factors, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
  geom_col() +
  geom_text(aes(label = round(Coefficient, 3)),
            # nudge labels differently for positive vs negative bars
            vjust = ifelse(importance_m6$Coefficient > 0, 0.5, 0.3),
            color = "black") +
  coord_flip() +
  labs(title = "Impact of factors on Returns for Model 6",
       x = "Factors",
       y = "Coefficient") +
  theme_minimal() +
  scale_fill_manual(
    values = c("TRUE" = "steelblue", "FALSE" = "coral"),
    labels = c("TRUE" = "Positive Impact", 
               "FALSE" = "Negative Impact"),
    name = ""
  ) +
  theme(
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

print(plot_m6_p2_factors)

Group by Sector

# Apply the model particularly by industry: recover each test row's sector
pred_sector_p2_m6 <- cbind(X_test_data_p2_m6, SECTOR = task2_reg_p2_df3$SECTOR[-splitIndex_p2_m6])

data_sector_p2_m6 <- data.frame(
  Predicted = pred_test_p2_m6,
  Actual = y_test_data_p2_m6,
  SECTOR = as.character(pred_sector_p2_m6$SECTOR)
)

# Per-sector evaluation metrics for the hold-out predictions
metrics_table_sector_p2_m6 <- data_sector_p2_m6 %>% 
  group_by(SECTOR) %>% 
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    # R^2 computed directly (1 - SS_res / SS_tot within each sector) instead of
    # via intermediate residual/total columns that are dropped afterwards
    R_Squared = 1 - sum((Actual - Predicted)^2) / sum((Actual - mean(Actual))^2)
  )

print(metrics_table_sector_p2_m6)
## # A tibble: 12 × 5
##    SECTOR                    MSE  RMSE   MAE R_Squared
##    <chr>                   <dbl> <dbl> <dbl>     <dbl>
##  1 Communication_Services 0.312  0.559 0.479   -0.0410
##  2 Consumer_Discretionary 0.224  0.473 0.391    0.0236
##  3 Consumer_Staples       0.204  0.452 0.305    0.0500
##  4 Energy                 0.426  0.653 0.584   -0.0114
##  5 Financials             0.128  0.357 0.287    0.0582
##  6 Health_Care            0.237  0.487 0.387    0.153 
##  7 Industrials            0.217  0.466 0.365    0.0202
##  8 Information_Technology 0.200  0.447 0.337   -0.0113
##  9 Materials              0.133  0.365 0.304   -0.608 
## 10 Real_Estate            0.0577 0.240 0.219    0.427 
## 11 Unknown                0.230  0.480 0.364    0.188 
## 12 Utilities              0.128  0.358 0.288   -0.119
# Plot the prediction vs actual by SECTOR, with one overall fitted line
plot_m6_p2_sector <- ggplot(
  data_sector_p2_m6,
  aes(x = Predicted, y = Actual, color = SECTOR)
) +
  geom_point(size = 2) +
  geom_smooth(
    aes(linetype = "Regression Line"),
    method = "lm",
    se = FALSE,
    color = "blue"
  ) +
  scale_linetype_manual(
    name = NULL,
    values = "solid",
    labels = "Regression Line"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12),
    strip.text = element_text(size = 12, face = "bold")
  ) +
  labs(
    title = "Evaluating Model 6's Performance: Predicted vs. Actual Returns",
    subtitle = "During the COVID Period (Grouped by Industry Sector)",
    x = "Predicted returns",
    y = "Actual returns",
    color = "Industry Sector"
  )


print(plot_m6_p2_sector)

Group by Company size

# Apply the model particularly by company size: recover each test row's bucket
pred_size_p2_m6 <- cbind(X_test_data_p2_m6, company_size = task2_reg_p2_df3$company_size[-splitIndex_p2_m6])

data_size_p2_m6 <- data.frame(
  Predicted = pred_test_p2_m6,
  Actual = y_test_data_p2_m6,
  company_size = pred_size_p2_m6$company_size
)

# Per-size-bucket evaluation metrics for the hold-out predictions
metrics_table_size_p2_m6 <- data_size_p2_m6 %>% 
  group_by(company_size) %>% 
  summarise(
    MSE = mean((Actual - Predicted)^2),
    RMSE = sqrt(MSE),
    MAE = mean(abs(Actual - Predicted)),
    # R^2 computed directly (1 - SS_res / SS_tot within each bucket) instead of
    # via intermediate residual/total columns that are dropped afterwards
    R_Squared = 1 - sum((Actual - Predicted)^2) / sum((Actual - mean(Actual))^2)
  )

print(metrics_table_size_p2_m6)
## # A tibble: 3 × 5
##   company_size   MSE  RMSE   MAE R_Squared
##   <fct>        <dbl> <dbl> <dbl>     <dbl>
## 1 Medium       0.205 0.452 0.362    0.116 
## 2 Small        0.223 0.472 0.367    0.118 
## 3 Large        0.141 0.376 0.286    0.0999
# Plot the prediction vs actual by company size, with one overall fitted line
plot_m6_p2_size <- ggplot(
  data_size_p2_m6,
  aes(x = Predicted, y = Actual, color = company_size)
) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(
    aes(linetype = "Regression Line"),
    method = "lm",
    se = FALSE,
    color = "blue"
  ) +
  scale_linetype_manual(
    name = NULL,
    values = "solid",
    labels = "Regression Line"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12),
    strip.text = element_text(size = 12, face = "bold")
  ) +
  labs(
    title = "Evaluating Model 6's Performance: Predicted vs. Actual Returns",
    subtitle = "During the COVID Period (Grouped by Company Size)",
    x = "Predicted returns",
    y = "Actual returns",
    color = "Company Size"
  )


print(plot_m6_p2_size)

Different variables in each model

get_variables <- function(model) {
  # Return the predictor terms of a fitted model, in formula order
  attr(model$terms, "term.labels")
}

# Collect the predictor list of every OLS model fitted above
ols_m1_v <- get_variables(model_1_p1)
ols_m2_v <- get_variables(model_2_p2)
ols_m3_v <- get_variables(model_3_p1)
ols_m4_v <- get_variables(model_4_p2)
ols_m5_v <- get_variables(model_5_p1)
ols_m6_v <- get_variables(model_6_p2)

ols_v_list <- list(
  Model_1 = as.character(ols_m1_v),
  Model_2 = as.character(ols_m2_v),
  Model_3 = as.character(ols_m3_v),
  Model_4 = as.character(ols_m4_v),
  Model_5 = as.character(ols_m5_v),
  Model_6 = as.character(ols_m6_v)
)

# Build one long (model, variable, order) data frame. The previous version grew
# the data frame with rbind() inside a for loop, which copies the accumulator on
# every iteration (O(n^2)); building a list of frames and binding once avoids that.
ols_model_data <- do.call(rbind, lapply(names(ols_v_list), function(model) {
  variables <- ols_v_list[[model]]
  data.frame(
    model = model,
    variable = variables,
    order = seq_along(variables),
    stringsAsFactors = FALSE
  )
}))

# Convert variable to a factor with alphabetically sorted levels, so the y-axis
# of the tile plot is in alphabetical order (the `order` column is kept for
# reference but is not what determines the axis ordering)
ols_model_data$variable <- factor(ols_model_data$variable, levels = sort(unique(ols_model_data$variable)))

# Generate tile plot: one tile per (model, variable) inclusion
ols_models_v <- ggplot(ols_model_data, aes(x = model, y = variable, fill = model)) +
  geom_tile(color = "white") + 
  labs(
    title = "Variables Included in Each OLS Regression Model",
    x = "Model",
    y = "Variables"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),  
    axis.text.y = element_text(size = 12),  
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14), 
    plot.title = element_text(hjust = 0.5, size = 16), 
    plot.title.position = "plot")

print(ols_models_v)

# Save plot for report
save_plot(ols_models_v, "ols_models_v")

3 Logistic Regressions

Preparation

# load library
library(stats)
library(pROC)
library(PRROC)

Create price change dummy variable

# Use the last and the first date PRC between COVID to create dummy variable represent price increase (1) or decrease (0).
# up_PRC = 1 when the last non-NA price on or before 2020-03-20 exceeds the
# first non-NA price on or after 2020-02-14; otherwise 0.
# NOTE(review): the two subsets are one-sided — PRC[date <= "2020-03-20"] also
# includes pre-2020-02-14 observations, so for a security with no prices inside
# the window the comparison can span earlier data. The accuracy check in the
# next chunk (two-sided filter) is what validates this against the intended
# window — confirm it reports no mismatches after any data change.
task3_price_dummy <- dataset1_df9 %>% 
  arrange(PERMNO, date) %>% 
  group_by(PERMNO) %>%
  summarise(
    up_PRC = ifelse(last(na.omit(PRC[date <= "2020-03-20"])) > first(na.omit(PRC[date >= "2020-02-14"])), 1, 0))

Inspect up_PRC accuracy

# Recompute first/last window prices independently and cross-check the dummy
up_PRC_summary <- dataset1_df9 %>% 
  filter(date >= "2020-02-14" & date <= "2020-03-20") %>% 
  arrange(PERMNO, date) %>% 
  group_by(PERMNO) %>%
  summarize(
    first_PRC = first(na.omit(PRC)),
    # Date of the first non-NA price. The previous code indexed `date` with
    # which(na.omit(PRC) == first_PRC)[1]: positions from the NA-stripped price
    # vector misalign with `date` whenever PRC contains NAs, and matching by
    # value picks the wrong row when a price repeats.
    first_date = first(date[!is.na(PRC)]),
    last_PRC = last(na.omit(PRC)),
    last_date = last(date[!is.na(PRC)]),
    .groups = 'drop'
  ) %>%
  mutate(
    up_PRC = ifelse(last_PRC > first_PRC, 1, 0)
  )

# Keep only PERMNOs whose dummy disagrees between the two computations
up_PRC_check <- task3_price_dummy %>%
  distinct(PERMNO, up_PRC) %>%
  left_join(up_PRC_summary %>% select(PERMNO, up_PRC), by = "PERMNO", suffix = c("_dataset1", "_dataset2")) %>%
  filter(up_PRC_dataset1 != up_PRC_dataset2) %>%
  select(PERMNO, up_PRC_dataset1, up_PRC_dataset2)

if (nrow(up_PRC_check) > 0) {
  print("These PERMNOs have an incorrect up_PRC:")
  print(up_PRC_check$PERMNO)
} else {
  print("All values of up_PRC are checked and accurate.")
}
## [1] "All values of up_PRC are checked and accurate."

Create dataset

# Attach the price-direction dummy to the COVID-period regression dataset
task3 <- left_join(task2_reg_p2_df1, task3_price_dummy, by = "PERMNO")

head(task3, n = 5)
## # A tibble: 5 × 58
##   PERMNO returns SHRCD TICKER SECTOR              company_size    VOL dollar_vol
##   <chr>    <dbl> <int> <chr>  <fct>               <fct>         <dbl>      <dbl>
## 1 10026   0.504     11 JJSF   Consumer_Staples    Medium       1.19e7     2.20e9
## 2 10028   0.0106    11 DGSE   Unknown             Small        9.71e6     1.62e7
## 3 10032   0.965     11 PLXS   Information_Techno… Small        2.09e7     1.50e9
## 4 10044   0.551     11 RMCF   Consumer_Staples    Small        1.41e6     1.25e7
## 5 10051   1.06      11 HNGR   Unknown             Small        4.25e7     9.85e8
## # ℹ 50 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## #   volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## #   abs_corr_sp <dbl>, security_type <fct>, tracking_error <dbl>,
## #   log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## #   log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## #   log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## #   log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …

Model 1: Price Change dummy variable: 2020-02-14 to 2020-03-20 & X variable: before 2020-02-14

Build Model 1 (Without industry)

# Select independent variables (factor loadings + log-transformed liquidity and
# risk measures computed before 2020-02-14)
X_logit_m1 <- task3 %>%
  select(Mkt_beta, SMB_beta, HML_beta, log_spread, log_volatility, scaled_market_cap, log_tracking_error, log_turnover_ratio)

y_logit_m1 <- task3$up_PRC

# Split the dataset into training and testing sets
set.seed(675)  

trainIndex <- createDataPartition(y_logit_m1, p = 0.8, list = FALSE)
X_train_logit_m1 <- X_logit_m1[trainIndex, ]
X_test_logit_m1 <- X_logit_m1[-trainIndex, ]
y_train_logit_m1 <- y_logit_m1[trainIndex]
y_test_logit_m1 <- y_logit_m1[trainIndex * 0 + seq_along(y_logit_m1)[-trainIndex]]

# Run the logistic regression model. The response is resolved from the calling
# environment by the formula; `data` supplies only the predictors.
# FIX: the previous call was as.data.frame(X_train_logit_m1, y_train_logit_m1) —
# the second positional argument of as.data.frame() is `row.names`, so the
# response vector was being passed as row names, not as data.
logit_model_1 <- glm(y_train_logit_m1 ~ ., 
                     data = as.data.frame(X_train_logit_m1),
                     family = binomial())

# Display model summary
summary(logit_model_1)
## 
## Call:
## glm(formula = y_train_logit_m1 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m1, 
##     y_train_logit_m1))
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           2.82283    0.70763   3.989 6.63e-05 ***
## Mkt_beta           -101.76338   11.80252  -8.622  < 2e-16 ***
## SMB_beta             -5.93703    7.78596  -0.763   0.4457    
## HML_beta             -3.06160   10.11749  -0.303   0.7622    
## log_spread           -0.41756    0.09364  -4.459 8.22e-06 ***
## log_volatility       -0.50257    0.09543  -5.266 1.39e-07 ***
## scaled_market_cap    -0.25935    0.35696  -0.727   0.4675    
## log_tracking_error    2.34798    0.21690  10.825  < 2e-16 ***
## log_turnover_ratio    0.18226    0.08268   2.204   0.0275 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1432.0  on 4151  degrees of freedom
## Residual deviance: 1011.6  on 4143  degrees of freedom
## AIC: 1029.6
## 
## Number of Fisher Scoring iterations: 8
# Check multicollinearity: all VIFs comfortably below the usual cutoff of 5
print(vif(logit_model_1))
##           Mkt_beta           SMB_beta           HML_beta         log_spread 
##           1.286861           1.163409           1.053116           2.744029 
##     log_volatility  scaled_market_cap log_tracking_error log_turnover_ratio 
##           2.709422           1.239080           3.067389           1.328285
# Calculate predicted probabilities on testing data
pred_prob_m1 <- predict(logit_model_1, 
                        newdata = as.data.frame(X_test_logit_m1), 
                        type = "response")

# Determine classification based on threshold of 0.5
class_m1 <- ifelse(pred_prob_m1 > 0.5, 1, 0)

# Confusion-matrix counts on the test set
true_positives_m1 <- sum(y_test_logit_m1 == 1 & class_m1 == 1)
true_negatives_m1 <- sum(y_test_logit_m1 == 0 & class_m1 == 0)
false_positives_m1 <- sum(y_test_logit_m1 == 0 & class_m1 == 1)
false_negatives_m1 <- sum(y_test_logit_m1 == 1 & class_m1 == 0)

# Accuracy: share of all test observations classified correctly
accuracy_m1 <- (true_positives_m1 + true_negatives_m1) / (true_positives_m1 + true_negatives_m1 + false_positives_m1 + false_negatives_m1)

# Precision (guarded against an empty predicted-positive set)
precision_m1 <- ifelse((true_positives_m1 + false_positives_m1) == 0, 0, true_positives_m1 / (true_positives_m1 + false_positives_m1))

# Recall (guarded against an empty actual-positive set)
recall_m1 <- ifelse((true_positives_m1 + false_negatives_m1) == 0, 0, true_positives_m1 / (true_positives_m1 + false_negatives_m1))

# F1 Score.
# FIX: when precision and recall are both 0 the original expression was 0/0 = NaN;
# report 0 instead, consistent with the guards above.
f1_m1 <- ifelse((precision_m1 + recall_m1) == 0, 0,
                2 * (precision_m1 * recall_m1) / (precision_m1 + recall_m1))

# Compute ROC curve and AUC
roc_curve_m1 <- roc(y_test_logit_m1, pred_prob_m1)
auc_value_m1 <- auc(roc_curve_m1)

# McFadden's pseudo R-squared = 1 - logLik(full) / logLik(null).
# The intercept-only model needs no `data` argument (y is in scope), and
# as.numeric() strips the "logLik" class so a plain number is stored.
null_model_1 <- glm(y_train_logit_m1 ~ 1, family = binomial())
rsquared_logit_m1 <- as.numeric(1 - (logLik(logit_model_1) / logLik(null_model_1)))

# Create a dataframe of raw confusion-matrix counts for reporting
metrics_counts_logit_m1 <- data.frame(
  Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
  Count = c(
    as.integer(true_positives_m1), 
    as.integer(true_negatives_m1), 
    as.integer(false_positives_m1), 
    as.integer(false_negatives_m1)
  )
)

# Create a dataframe of derived performance metrics (test-set evaluation)
metrics_evaluation_logit_m1 <- data.frame(
  Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
  Value = c(
    accuracy_m1, 
    precision_m1, 
    recall_m1,
    f1_m1,
    auc_value_m1,
    rsquared_logit_m1
  )
)

# Print the results (counts first, then the evaluation metrics)
print(metrics_counts_logit_m1)
##            Metric Count
## 1  True Positives     9
## 2  True Negatives  1011
## 3 False Positives     2
## 4 False Negatives    16
print(metrics_evaluation_logit_m1)
##      Metric     Value
## 1  Accuracy 0.9826590
## 2 Precision 0.8181818
## 3    Recall 0.3600000
## 4  F1 Score 0.5000000
## 5       AUC 0.9167621
## 6 R_squared 0.2935775

Confusion Matrix

# Assemble the 2x2 confusion matrix. matrix() fills column-first, so the first
# column (actual Negative) is TN, FP and the second (actual Positive) is FN, TP.
true_false_m1 <- matrix(c(true_negatives_m1, false_positives_m1, false_negatives_m1, true_positives_m1),
                        nrow = 2,
                        dimnames = list(Predicted = c("Negative", "Positive"),
                                        Actual = c("Negative", "Positive")))

# Long-format frame (Predicted, Actual, Freq) for ggplot
conf_matrix_m1 <- as.data.frame(as.table(true_false_m1))

# Green gradient for the tile fill
palette <- brewer.pal(n = 4, name = "Greens")

# Heatmap of the confusion matrix with counts printed in each tile.
# FIX: reuse `palette` in scale_fill_gradientn() — the original defined it and
# then recomputed brewer.pal() inline, leaving the variable dead.
conf_matrix_m1_plot <- ggplot(conf_matrix_m1,
                              aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "black", size = 6) +
  scale_fill_gradientn(colors = palette, guide = "none") +
  labs(title = "Confusion Matrix  for Model 1: Price Increase Predictions between February 14, 2020 to March 20, 2020",
       x = "Predicted",
       y = "Actual") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )

print(conf_matrix_m1_plot)

# Save the plot for report
save_plot(conf_matrix_m1_plot, "Confusion Matrix for logit model 1")

Important Factor

# Extract model coefficients (log-odds scale)
coefficients_logit_m1 <- coef(logit_model_1)

# Convert coefficients to odds ratios via exponentiation
odds_ratios_m1 <- exp(coefficients_logit_m1)

# One row per factor with its odds ratio
importance_logit_m1 <- data.frame(
  Factors = names(coefficients_logit_m1),
  Odds_Ratio = odds_ratios_m1
)

# Exclude the intercept row.
# FIX: the column is named `Factors`; the original filtered on `$Factor`, which
# only worked via data.frame partial matching — fragile, and it silently selects
# nothing if this frame ever becomes a tibble.
importance_logit_m1 <- importance_logit_m1[importance_logit_m1$Factors != "(Intercept)", ]

# Colour key: odds ratio > 1 raises the odds of a price increase, < 1 lowers it
importance_logit_m1$Fill_Color <- ifelse(importance_logit_m1$Odds_Ratio > 1, "steelblue", "coral")

# Horizontal bar chart of odds ratios, sorted by magnitude
ggplot(importance_logit_m1,
       aes(x = reorder(Factors, Odds_Ratio), y = Odds_Ratio, fill = Fill_Color)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = round(Odds_Ratio, 2)), hjust = 1, size = 3) +
  labs(title = "Impact of Factors on Price increase likelihood for Model 1: Positive Vs Negative Associations",
       x = "Factors",
       y = "Odds Ratio\n(The exponentiation of coefficients)") +
  # Reference line at 1: no effect on the odds
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  annotate("text", x = Inf, y = 1, label = "Odds Ratio = 1",
           hjust = 0, vjust = 0, color = "grey", size = 3.5, fontface = "italic") +
  theme_minimal() +
  scale_fill_manual(
    values = c("steelblue" = "steelblue", "coral" = "coral"),
    labels = c("steelblue" = "Positively Associated with Price Increase", 
               "coral" = "Negatively Associated with Price Increase"),
    name = "") +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

Model 2: Continuous Variables from Model 1 with Industry Variable

# Select independent variables: Model 1's continuous features plus sector dummies
X_logit_m2 <- task3 %>%
  select(Mkt_beta, SMB_beta, HML_beta, log_spread, log_volatility, scaled_market_cap, log_tracking_error, log_turnover_ratio, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown
)

# Dependent variable: dummy for a price increase over the event window
y_logit_m2 <- task3$up_PRC  

# Split the dataset into training and testing sets (same seed as Model 1 so the
# partitions are comparable across models)
set.seed(675) 

trainIndex <- createDataPartition(y_logit_m2, p = 0.8, list = FALSE)
X_train_logit_m2 <- X_logit_m2[trainIndex, ]
X_test_logit_m2 <- X_logit_m2[-trainIndex, ]
y_train_logit_m2 <- y_logit_m2[trainIndex]
y_test_logit_m2 <- y_logit_m2[-trainIndex]


# Fit the logistic regression model.
# FIX: as.data.frame(X, y) passed the response as `row.names`, not as a column;
# glm() was silently resolving y from the calling environment. Pass the
# predictor frame directly instead.
logit_model_2 <- glm(y_train_logit_m2 ~ .,
                     data = X_train_logit_m2,
                     family = binomial())

# Display model summary
summary(logit_model_2)
## 
## Call:
## glm(formula = y_train_logit_m2 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m2, 
##     y_train_logit_m2))
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    3.91523    0.89242   4.387 1.15e-05 ***
## Mkt_beta                     -96.88856   12.07981  -8.021 1.05e-15 ***
## SMB_beta                      -7.54789    8.04238  -0.939 0.347980    
## HML_beta                       1.58545   10.76882   0.147 0.882954    
## log_spread                    -0.46597    0.09768  -4.770 1.84e-06 ***
## log_volatility                -0.43685    0.10269  -4.254 2.10e-05 ***
## scaled_market_cap             -0.30859    0.35838  -0.861 0.389204    
## log_tracking_error             2.42835    0.22352  10.864  < 2e-16 ***
## log_turnover_ratio             0.09634    0.08735   1.103 0.270075    
## SECTORMaterials               -1.95078    0.71579  -2.725 0.006423 ** 
## SECTORIndustrials             -3.11610    0.80881  -3.853 0.000117 ***
## SECTORFinancials              -2.55834    0.66841  -3.827 0.000129 ***
## SECTORInformation_Technology  -2.33281    0.63495  -3.674 0.000239 ***
## SECTORConsumer_Discretionary  -3.57383    1.05670  -3.382 0.000719 ***
## SECTORHealth_Care             -1.13104    0.40944  -2.762 0.005738 ** 
## SECTOREnergy                  -1.89771    0.90855  -2.089 0.036733 *  
## SECTORUtilities               -2.09672    1.06645  -1.966 0.049291 *  
## SECTORReal_Estate            -15.67593  847.50173  -0.018 0.985243    
## SECTORCommunication_Services -15.78635  430.03630  -0.037 0.970717    
## SECTORUnknown                 -1.23672    0.36444  -3.393 0.000690 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1432.02  on 4151  degrees of freedom
## Residual deviance:  957.46  on 4132  degrees of freedom
## AIC: 997.46
## 
## Number of Fisher Scoring iterations: 16
# Check multicollinearity: all VIFs below the usual cutoff of 5
print(vif(logit_model_2))
##                     Mkt_beta                     SMB_beta 
##                     1.314022                     1.235449 
##                     HML_beta                   log_spread 
##                     1.131153                     2.813206 
##               log_volatility            scaled_market_cap 
##                     3.221248                     1.276669 
##           log_tracking_error           log_turnover_ratio 
##                     3.152487                     1.423055 
##              SECTORMaterials            SECTORIndustrials 
##                     1.263827                     1.190700 
##             SECTORFinancials SECTORInformation_Technology 
##                     1.287306                     1.354766 
## SECTORConsumer_Discretionary            SECTORHealth_Care 
##                     1.095006                     2.923145 
##                 SECTOREnergy              SECTORUtilities 
##                     1.250603                     1.097808 
##            SECTORReal_Estate SECTORCommunication_Services 
##                     1.000000                     1.000001 
##                SECTORUnknown 
##                     3.807168
# Calculate predicted probabilities on testing data
pred_prob_m2 <- predict(logit_model_2, 
                        newdata = as.data.frame(X_test_logit_m2), 
                        type = "response")

# Determine classification based on threshold of 0.5
class_m2 <- ifelse(pred_prob_m2 > 0.5, 1, 0)

# Confusion-matrix counts on the test set
true_positives_m2 <- sum(y_test_logit_m2 == 1 & class_m2 == 1)
true_negatives_m2 <- sum(y_test_logit_m2 == 0 & class_m2 == 0)
false_positives_m2 <- sum(y_test_logit_m2 == 0 & class_m2 == 1)
false_negatives_m2 <- sum(y_test_logit_m2 == 1 & class_m2 == 0)

# Accuracy: share of all test observations classified correctly
accuracy_m2 <- (true_positives_m2 + true_negatives_m2) / (true_positives_m2 + true_negatives_m2 + false_positives_m2 + false_negatives_m2)

# Precision (guarded against an empty predicted-positive set)
precision_m2 <- ifelse((true_positives_m2 + false_positives_m2) == 0, 0, true_positives_m2 / (true_positives_m2 + false_positives_m2))

# Recall (guarded against an empty actual-positive set)
recall_m2 <- ifelse((true_positives_m2 + false_negatives_m2) == 0, 0, true_positives_m2 / (true_positives_m2 + false_negatives_m2))

# F1 Score.
# FIX: guard the denominator — if precision and recall are both 0 the original
# expression returned NaN (0/0); report 0 instead.
f1_m2 <- ifelse((precision_m2 + recall_m2) == 0, 0,
                2 * (precision_m2 * recall_m2) / (precision_m2 + recall_m2))

# Compute ROC curve and AUC
roc_curve_m2 <- roc(y_test_logit_m2, pred_prob_m2)
auc_value_m2 <- auc(roc_curve_m2)

# McFadden's pseudo R-squared = 1 - logLik(full) / logLik(null).
# The intercept-only model needs no `data` argument; as.numeric() strips the
# "logLik" class so a plain number is stored.
null_model_2 <- glm(y_train_logit_m2 ~ 1, family = binomial())
rsquared_logit_m2 <- as.numeric(1 - (logLik(logit_model_2) / logLik(null_model_2)))

# Create a dataframe of raw confusion-matrix counts for reporting
metrics_counts_logit_m2 <- data.frame(
  Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
  Count = c(
    as.integer(true_positives_m2), 
    as.integer(true_negatives_m2), 
    as.integer(false_positives_m2), 
    as.integer(false_negatives_m2)
  )
)

# Create a dataframe of derived performance metrics (test-set evaluation)
metrics_evaluation_logit_m2 <- data.frame(
  Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
  Value = c(
    accuracy_m2, 
    precision_m2, 
    recall_m2,
    f1_m2,
    auc_value_m2,
    rsquared_logit_m2
  )
)

# Print the results (counts first, then the evaluation metrics)
print(metrics_counts_logit_m2)
##            Metric Count
## 1  True Positives    10
## 2  True Negatives  1011
## 3 False Positives     2
## 4 False Negatives    15
print(metrics_evaluation_logit_m2)
##      Metric     Value
## 1  Accuracy 0.9836224
## 2 Precision 0.8333333
## 3    Recall 0.4000000
## 4  F1 Score 0.5405405
## 5       AUC 0.8991115
## 6 R_squared 0.3313900

Confusion Matrix

# Assemble the 2x2 confusion matrix. matrix() fills column-first, so the first
# column (actual Negative) is TN, FP and the second (actual Positive) is FN, TP.
true_false_m2 <- matrix(c(true_negatives_m2, false_positives_m2, false_negatives_m2, true_positives_m2),
                        nrow = 2,
                        dimnames = list(Predicted = c("Negative", "Positive"),
                                        Actual = c("Negative", "Positive")))

# Long-format frame (Predicted, Actual, Freq) for ggplot
conf_matrix_m2 <- as.data.frame(as.table(true_false_m2))

# Green gradient for the tile fill
palette <- brewer.pal(n = 4, name = "Greens")

# Heatmap of the confusion matrix with counts printed in each tile.
# FIX: reuse `palette` in scale_fill_gradientn() — the original defined it and
# then recomputed brewer.pal() inline, leaving the variable dead.
conf_matrix_m2_plot <- ggplot(conf_matrix_m2,
                              aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "black", size = 6) +
  scale_fill_gradientn(colors = palette, guide = "none") +
  labs(title = "Confusion Matrix for Model 2: Price Increase Predictions between February 14, 2020 to March 20, 2020",
       subtitle = "(Model 2: Continuous Variables from Model 1 with Industry Variable)",
       x = "Predicted", 
       y = "Actual") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )

print(conf_matrix_m2_plot)

# Save the plot for report
save_plot(conf_matrix_m2_plot, "Confusion Matrix for logit Model 2")

Important Factor

# Extract model coefficients (log-odds scale)
coefficients_logit_m2 <- coef(logit_model_2)

# Convert coefficients to odds ratios via exponentiation
odds_ratios_m2 <- exp(coefficients_logit_m2)

# One row per factor with its odds ratio
importance_logit_m2 <- data.frame(
  Factors = names(coefficients_logit_m2),
  Odds_Ratio = odds_ratios_m2
)

# Exclude the intercept row.
# FIX: the column is named `Factors`; the original filtered on `$Factor`, which
# only worked via data.frame partial matching — fragile, and it silently selects
# nothing if this frame ever becomes a tibble.
importance_logit_m2 <- importance_logit_m2[importance_logit_m2$Factors != "(Intercept)", ]

# Colour key: odds ratio > 1 raises the odds of a price increase, < 1 lowers it
importance_logit_m2$Fill_Color <- ifelse(importance_logit_m2$Odds_Ratio > 1, "steelblue", "coral")

# Horizontal bar chart of odds ratios, sorted by magnitude
ggplot(importance_logit_m2,
       aes(x = reorder(Factors, Odds_Ratio), y = Odds_Ratio, fill = Fill_Color)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = round(Odds_Ratio, 2)), hjust = 1, size = 3) +
  labs(title = "Impact of Factors on Price increase likelihood for Model 2: Positive Vs Negative Associations",
       subtitle = "(Model 2: Continuous Variables from Model 1 with Industry Variable)",
       x = "Factors",
       y = "Odds Ratio\n(The exponentiation of coefficients)") +
  # Reference line at 1: no effect on the odds
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  annotate("text", x = Inf, y = 1, label = "Odds Ratio = 1",
           hjust = 0, vjust = 0, color = "grey", size = 3.5, fontface = "italic") +
  theme_minimal() +
  scale_fill_manual(
    values = c("steelblue" = "steelblue", "coral" = "coral"),
    labels = c("steelblue" = "Positively Associated with Price Increase", 
               "coral" = "Negatively Associated with Price Increase"),
    name = "") +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

Model 3: Different Continuous Variables with Industry Variable Compared to Model 2

# Select independent variables: swaps Model 2's HML_beta / scaled_market_cap for
# scaled_dollar_vol, keeping the sector dummies
X_logit_m3 <- task3 %>%
  select(Mkt_beta, SMB_beta, scaled_dollar_vol, log_spread, log_volatility, log_tracking_error, log_turnover_ratio, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown
)

# Dependent variable: dummy for a price increase over the event window
y_logit_m3 <- task3$up_PRC  

# Split the dataset into training and testing sets (same seed across models
# for comparable partitions)
set.seed(675) 

trainIndex <- createDataPartition(y_logit_m3, p = 0.8, list = FALSE)
X_train_logit_m3 <- X_logit_m3[trainIndex, ]
X_test_logit_m3 <- X_logit_m3[-trainIndex, ]
y_train_logit_m3 <- y_logit_m3[trainIndex]
y_test_logit_m3 <- y_logit_m3[-trainIndex]


# Fit the logistic regression model.
# FIX: as.data.frame(X, y) passed the response as `row.names`, not as a column;
# glm() was silently resolving y from the calling environment. Pass the
# predictor frame directly instead.
logit_model_3 <- glm(y_train_logit_m3 ~ .,
                     data = X_train_logit_m3,
                     family = binomial())

# Display model summary
summary(logit_model_3)
## 
## Call:
## glm(formula = y_train_logit_m3 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m3, 
##     y_train_logit_m3))
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    4.05066    0.86775   4.668 3.04e-06 ***
## Mkt_beta                     -96.67465   11.96738  -8.078 6.57e-16 ***
## SMB_beta                      -7.05142    7.96289  -0.886 0.375868    
## scaled_dollar_vol             -0.16817    0.24663  -0.682 0.495319    
## log_spread                    -0.45490    0.09665  -4.707 2.52e-06 ***
## log_volatility                -0.44120    0.10272  -4.295 1.74e-05 ***
## log_tracking_error             2.43435    0.22285  10.924  < 2e-16 ***
## log_turnover_ratio             0.11227    0.08640   1.299 0.193833    
## SECTORMaterials               -1.92123    0.71533  -2.686 0.007236 ** 
## SECTORIndustrials             -3.08373    0.80834  -3.815 0.000136 ***
## SECTORFinancials              -2.51658    0.66675  -3.774 0.000160 ***
## SECTORInformation_Technology  -2.30834    0.63461  -3.637 0.000275 ***
## SECTORConsumer_Discretionary  -3.53092    1.05639  -3.342 0.000830 ***
## SECTORHealth_Care             -1.11326    0.40781  -2.730 0.006336 ** 
## SECTOREnergy                  -1.85228    0.88731  -2.088 0.036840 *  
## SECTORUtilities               -2.11536    1.06523  -1.986 0.047052 *  
## SECTORReal_Estate            -15.62660  847.77716  -0.018 0.985294    
## SECTORCommunication_Services -15.75684  432.20897  -0.036 0.970918    
## SECTORUnknown                 -1.19074    0.36180  -3.291 0.000998 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1432.02  on 4151  degrees of freedom
## Residual deviance:  958.12  on 4133  degrees of freedom
## AIC: 996.12
## 
## Number of Fisher Scoring iterations: 16
# Check multicollinearity: all VIFs below the usual cutoff of 5
print(vif(logit_model_3))
##                     Mkt_beta                     SMB_beta 
##                     1.287699                     1.224224 
##            scaled_dollar_vol                   log_spread 
##                     1.222968                     2.780091 
##               log_volatility           log_tracking_error 
##                     3.226058                     3.154757 
##           log_turnover_ratio              SECTORMaterials 
##                     1.387073                     1.261708 
##            SECTORIndustrials             SECTORFinancials 
##                     1.189484                     1.280925 
## SECTORInformation_Technology SECTORConsumer_Discretionary 
##                     1.348813                     1.093860 
##            SECTORHealth_Care                 SECTOREnergy 
##                     2.898106                     1.199260 
##              SECTORUtilities            SECTORReal_Estate 
##                     1.096418                     1.000000 
## SECTORCommunication_Services                SECTORUnknown 
##                     1.000001                     3.754395
# Calculate predicted probabilities on testing data
pred_prob_m3 <- predict(logit_model_3, 
                        newdata = as.data.frame(X_test_logit_m3), 
                        type = "response")

# Determine classification based on threshold of 0.5
class_m3 <- ifelse(pred_prob_m3 > 0.5, 1, 0)

# Confusion-matrix counts on the test set
true_positives_m3 <- sum(y_test_logit_m3 == 1 & class_m3 == 1)
true_negatives_m3 <- sum(y_test_logit_m3 == 0 & class_m3 == 0)
false_positives_m3 <- sum(y_test_logit_m3 == 0 & class_m3 == 1)
false_negatives_m3 <- sum(y_test_logit_m3 == 1 & class_m3 == 0)

# Accuracy: share of all test observations classified correctly
accuracy_m3 <- (true_positives_m3 + true_negatives_m3) / (true_positives_m3 + true_negatives_m3 + false_positives_m3 + false_negatives_m3)

# Precision (guarded against an empty predicted-positive set)
precision_m3 <- ifelse((true_positives_m3 + false_positives_m3) == 0, 0, true_positives_m3 / (true_positives_m3 + false_positives_m3))

# Recall (guarded against an empty actual-positive set)
recall_m3 <- ifelse((true_positives_m3 + false_negatives_m3) == 0, 0, true_positives_m3 / (true_positives_m3 + false_negatives_m3))

# F1 Score.
# FIX: guard the denominator — if precision and recall are both 0 the original
# expression returned NaN (0/0); report 0 instead.
f1_m3 <- ifelse((precision_m3 + recall_m3) == 0, 0,
                2 * (precision_m3 * recall_m3) / (precision_m3 + recall_m3))

# Compute ROC curve and AUC
roc_curve_m3 <- roc(y_test_logit_m3, pred_prob_m3)
auc_value_m3 <- auc(roc_curve_m3)

# McFadden's pseudo R-squared = 1 - logLik(full) / logLik(null).
# The intercept-only model needs no `data` argument; as.numeric() strips the
# "logLik" class so a plain number is stored.
null_model_3 <- glm(y_train_logit_m3 ~ 1, family = binomial())
rsquared_logit_m3 <- as.numeric(1 - (logLik(logit_model_3) / logLik(null_model_3)))

# Create a dataframe of raw confusion-matrix counts for reporting
metrics_counts_logit_m3 <- data.frame(
  Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
  Count = c(
    as.integer(true_positives_m3), 
    as.integer(true_negatives_m3), 
    as.integer(false_positives_m3), 
    as.integer(false_negatives_m3)
  )
)

# Create a dataframe of derived performance metrics (test-set evaluation)
metrics_evaluation_logit_m3 <- data.frame(
  Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
  Value = c(
    accuracy_m3, 
    precision_m3, 
    recall_m3,
    f1_m3,
    auc_value_m3,
    rsquared_logit_m3
  )
)

# Print the results (counts first, then the evaluation metrics)
print(metrics_counts_logit_m3)
##            Metric Count
## 1  True Positives    11
## 2  True Negatives  1011
## 3 False Positives     2
## 4 False Negatives    14
print(metrics_evaluation_logit_m3)
##      Metric     Value
## 1  Accuracy 0.9845857
## 2 Precision 0.8461538
## 3    Recall 0.4400000
## 4  F1 Score 0.5789474
## 5       AUC 0.8986772
## 6 R_squared 0.3309308

Confusion Matrix

# Assemble the 2x2 confusion matrix. matrix() fills column-first, so the first
# column (actual Negative) is TN, FP and the second (actual Positive) is FN, TP.
true_false_m3 <- matrix(c(true_negatives_m3, false_positives_m3, false_negatives_m3, true_positives_m3),
                        nrow = 2,
                        dimnames = list(Predicted = c("Negative", "Positive"),
                                        Actual = c("Negative", "Positive")))

# Long-format frame (Predicted, Actual, Freq) for ggplot
conf_matrix_m3 <- as.data.frame(as.table(true_false_m3))

# Green gradient for the tile fill
palette <- brewer.pal(n = 4, name = "Greens")

# Heatmap of the confusion matrix with counts printed in each tile.
# FIX: reuse `palette` in scale_fill_gradientn() — the original defined it and
# then recomputed brewer.pal() inline, leaving the variable dead.
conf_matrix_m3_plot <- ggplot(conf_matrix_m3,
                              aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), color = "black", size = 6) +
  scale_fill_gradientn(colors = palette, guide = "none") +
  labs(title = "Confusion Matrix for Model 3: Price Increase Predictions between February 14, 2020 to March 20, 2020",
       subtitle = "(Model 3: Different Continuous Variables with Industry Variable Compared to Model 2)",
       x = "Predicted", 
       y = "Actual") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )

print(conf_matrix_m3_plot)

# Save the plot for report
save_plot(conf_matrix_m3_plot, "Confusion Matrix for logit Model 3")

Important Factor

# Extract model coefficients (log-odds scale)
coefficients_logit_m3 <- coef(logit_model_3)

# Convert coefficients to odds ratios via exponentiation
odds_ratios_m3 <- exp(coefficients_logit_m3)

# One row per factor with its odds ratio
importance_logit_m3 <- data.frame(
  Factors = names(coefficients_logit_m3),
  Odds_Ratio = odds_ratios_m3
)

# Exclude the intercept row.
# FIX: the column is named `Factors`; the original filtered on `$Factor`, which
# only worked via data.frame partial matching — fragile, and it silently selects
# nothing if this frame ever becomes a tibble.
importance_logit_m3 <- importance_logit_m3[importance_logit_m3$Factors != "(Intercept)", ]

# Colour key: odds ratio > 1 raises the odds of a price increase, < 1 lowers it
importance_logit_m3$Fill_Color <- ifelse(importance_logit_m3$Odds_Ratio > 1, "steelblue", "coral")

# Horizontal bar chart of odds ratios, sorted by magnitude.
# FIX: the subtitle was doubly parenthesised ("((Model 3: ...))").
ggplot(importance_logit_m3,
       aes(x = reorder(Factors, Odds_Ratio), y = Odds_Ratio, fill = Fill_Color)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = round(Odds_Ratio, 2)), hjust = 1, size = 3) +
  labs(title = "Impact of Factors on Price increase likelihood for Model 3: Positive Vs Negative Associations",
       subtitle = "(Model 3: Different Continuous Variables with Industry Variable Compared to Model 2)",
       x = "Factors",
       y = "Odds Ratio\n(The exponentiation of coefficients)") +
  # Reference line at 1: no effect on the odds
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  annotate("text", x = Inf, y = 1, label = "Odds Ratio = 1",
           hjust = 0, vjust = 0, color = "grey", size = 3.5, fontface = "italic") +
  theme_minimal() +
  scale_fill_manual(
    values = c("steelblue" = "steelblue", "coral" = "coral"),
    labels = c("steelblue" = "Positively Associated with Price Increase", 
               "coral" = "Negatively Associated with Price Increase"),
    name = "") +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

Model 4: Continuous Variables from Model 3 without Industry Variable

# Select independent variables: Model 3's continuous features, no sector dummies
X_logit_m4 <- task3 %>%
  select(Mkt_beta, SMB_beta, scaled_dollar_vol, log_spread, log_volatility, log_tracking_error, log_turnover_ratio)

# Dependent variable: dummy for a price increase over the event window
y_logit_m4 <- task3$up_PRC  

# Split the dataset into training and testing sets (same seed across models
# for comparable partitions)
set.seed(675)

trainIndex <- createDataPartition(y_logit_m4, p = 0.8, list = FALSE)
X_train_logit_m4 <- X_logit_m4[trainIndex, ]
X_test_logit_m4 <- X_logit_m4[-trainIndex, ]
y_train_logit_m4 <- y_logit_m4[trainIndex]
y_test_logit_m4 <- y_logit_m4[-trainIndex]


# Fit the logistic regression model.
# FIX: as.data.frame(X, y) passed the response as `row.names`, not as a column;
# glm() was silently resolving y from the calling environment. Pass the
# predictor frame directly instead.
logit_model_4 <- glm(y_train_logit_m4 ~ .,
                     data = X_train_logit_m4,
                     family = binomial())

# Display model summary
summary(logit_model_4)
## 
## Call:
## glm(formula = y_train_logit_m4 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m4, 
##     y_train_logit_m4))
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           2.93863    0.66511   4.418 9.95e-06 ***
## Mkt_beta           -102.74998   11.66734  -8.807  < 2e-16 ***
## SMB_beta             -5.73941    7.76574  -0.739   0.4599    
## scaled_dollar_vol    -0.14856    0.22590  -0.658   0.5108    
## log_spread           -0.40899    0.09140  -4.475 7.65e-06 ***
## log_volatility       -0.50583    0.09463  -5.345 9.03e-08 ***
## log_tracking_error    2.34476    0.21631  10.840  < 2e-16 ***
## log_turnover_ratio    0.19554    0.08185   2.389   0.0169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1432.0  on 4151  degrees of freedom
## Residual deviance: 1011.9  on 4144  degrees of freedom
## AIC: 1027.9
## 
## Number of Fisher Scoring iterations: 7
# Check multicollinearity: all VIFs below the usual cutoff of 5.
# FIX: wrap in print() for consistency with Models 1-3 (a bare top-level call
# auto-prints in knitr chunks, but not when this script is source()d).
print(vif(logit_model_4))
##           Mkt_beta           SMB_beta  scaled_dollar_vol         log_spread 
##           1.248160           1.155427           1.178263           2.626577 
##     log_volatility log_tracking_error log_turnover_ratio 
##           2.661042           3.058806           1.297536
# Calculate predicted probabilities on testing data
pred_prob_m4 <- predict(logit_model_4, 
                        newdata = as.data.frame(X_test_logit_m4), 
                        type = "response")

# Determine classification based on threshold of 0.5
class_m4 <- ifelse(pred_prob_m4 > 0.5, 1, 0)

# Confusion-matrix counts on the test set
true_positives_m4 <- sum(y_test_logit_m4 == 1 & class_m4 == 1)
true_negatives_m4 <- sum(y_test_logit_m4 == 0 & class_m4 == 0)
false_positives_m4 <- sum(y_test_logit_m4 == 0 & class_m4 == 1)
false_negatives_m4 <- sum(y_test_logit_m4 == 1 & class_m4 == 0)

# Accuracy: share of all test observations classified correctly
accuracy_m4 <- (true_positives_m4 + true_negatives_m4) / (true_positives_m4 + true_negatives_m4 + false_positives_m4 + false_negatives_m4)

# Precision (guarded against an empty predicted-positive set)
precision_m4 <- ifelse((true_positives_m4 + false_positives_m4) == 0, 0, true_positives_m4 / (true_positives_m4 + false_positives_m4))

# Recall (guarded against an empty actual-positive set)
recall_m4 <- ifelse((true_positives_m4 + false_negatives_m4) == 0, 0, true_positives_m4 / (true_positives_m4 + false_negatives_m4))

# F1 Score.
# FIX: guard the denominator — if precision and recall are both 0 the original
# expression returned NaN (0/0); report 0 instead.
f1_m4 <- ifelse((precision_m4 + recall_m4) == 0, 0,
                2 * (precision_m4 * recall_m4) / (precision_m4 + recall_m4))

# Compute ROC curve and AUC
roc_curve_m4 <- roc(y_test_logit_m4, pred_prob_m4)
auc_value_m4 <- auc(roc_curve_m4)

# McFadden's pseudo R-squared = 1 - logLik(full) / logLik(null).
# The intercept-only model needs no `data` argument; as.numeric() strips the
# "logLik" class so a plain number is stored.
null_model_4 <- glm(y_train_logit_m4 ~ 1, family = binomial())
rsquared_logit_m4 <- as.numeric(1 - (logLik(logit_model_4) / logLik(null_model_4)))

# Collect confusion-matrix cell counts into a tidy frame for reporting
metrics_counts_logit_m4 <- data.frame(
  Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
  Count = c(
    as.integer(true_positives_m4), 
    as.integer(true_negatives_m4), 
    as.integer(false_positives_m4), 
    as.integer(false_negatives_m4)
  )
)

# Collect the evaluation metrics (R_squared here is McFadden's pseudo R2)
metrics_evaluation_logit_m4 <- data.frame(
  Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
  Value = c(
    accuracy_m4, 
    precision_m4, 
    recall_m4,
    f1_m4,
    auc_value_m4,
    rsquared_logit_m4
  )
)

# Print the results
print(metrics_counts_logit_m4)
##            Metric Count
## 1  True Positives     9
## 2  True Negatives  1011
## 3 False Positives     2
## 4 False Negatives    16
print(metrics_evaluation_logit_m4)
##      Metric     Value
## 1  Accuracy 0.9826590
## 2 Precision 0.8181818
## 3    Recall 0.3600000
## 4  F1 Score 0.5000000
## 5       AUC 0.9169990
## 6 R_squared 0.2933615

Confusion Matrix

# Confusion-matrix cells for Model 4; matrix() fills column-wise, so rows are
# the predicted class and columns the actual class.
true_false_m4 <- matrix(c(true_negatives_m4, false_positives_m4, false_negatives_m4, true_positives_m4),
                        nrow = 2,
                        dimnames = list(Predicted = c("Negative", "Positive"),
                                        Actual = c("Negative", "Positive")))

# Long format (Predicted, Actual, Freq) expected by ggplot
conf_matrix_m4 <- as.data.frame(as.table(true_false_m4))

# Plot the confusion matrix as a tiled heat map.
# Fix: removed the unused `palette <- brewer.pal(...)` assignment — it was
# never referenced (the scale below calls brewer.pal() directly) and it
# shadowed grDevices::palette().
conf_matrix_m4_plot <- ggplot(conf_matrix_m4, 
                              aes(x = Predicted, 
                                  y = Actual, 
                                  fill = Freq)) +
  
  geom_tile(color = "black") + 
  
  # Show the raw count inside each tile
  geom_text(aes(label = Freq), 
            color = "black", 
            size = 6) +
  
  scale_fill_gradientn(colors = brewer.pal(n = 4, 
                                           name = "Greens"), 
                       guide = "none") +
  
  labs(title = "Confusion Matrix for Model 4: Price Increase Predictions between February 14, 2020 to March 20, 2020",
       subtitle = "(Model 4: Continuous Variables from Model 3 without Industry Variable)",
       x = "Predicted", 
       y = "Actual") +
  
  theme_minimal() +
  
  theme(
      plot.title = element_text(size = 16, 
                                face = "bold", 
                                hjust = 0.5),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 12)
    )

print(conf_matrix_m4_plot)  

# Save the plot for report (project helper)
save_plot(conf_matrix_m4_plot, "Confusion Matrix for logit Model 4")

Important Factors

# Extract the fitted log-odds coefficients of Model 4
coefficients_logit_m4 <- coef(logit_model_4)

# Convert coefficients to odds ratios (exp of the log-odds)
odds_ratios_m4 <- exp(coefficients_logit_m4)

# Factor-importance table: one row per model term
importance_logit_m4 <- data.frame(
  Factors = names(coefficients_logit_m4),
  Odds_Ratio = odds_ratios_m4
)

# Exclude the intercept row. Fix: the column is named "Factors"; the previous
# `$Factor` only worked through base R's partial matching on data frames.
importance_logit_m4 <- importance_logit_m4[importance_logit_m4$Factors != "(Intercept)", ]

# Colour key: odds ratio > 1 means positively associated with a price increase
importance_logit_m4$Fill_Color <- ifelse(importance_logit_m4$Odds_Ratio > 1, "steelblue", "coral")

# Plot with positive and negative impacts
# Plot the factor
# Horizontal bar chart of odds ratios, ordered by size; bars above the dashed
# line at 1 raise the odds of a price increase, bars below lower them.
ggplot(importance_logit_m4, 
       aes(x = reorder(Factors, Odds_Ratio), 
           y = Odds_Ratio, 
           fill = Fill_Color)) +
  
  geom_bar(stat = "identity") +
  
  coord_flip() +
  
  # Label each bar with its rounded odds ratio
  geom_text(aes(label = round(Odds_Ratio, 2)), 
            hjust = 1, 
            size = 3) + 
  
  labs(title = "Impact of Factors on Price increase likelihood for Model 4: Positive Vs Negative Associations",
       subtitle = "(Model 4: Continuous Variables from Model 3 without Industry Variable)", 
       x = "Factors",
       y = "Odds Ratio\n(The exponentiation of coefficients)") +
  
  # Reference line: odds ratio of 1 means no association
  geom_hline(yintercept = 1, 
             linetype = "dashed", 
             color = "red") +
  
  annotate("text", 
           x = Inf, 
           y = 1, 
           label = "Odds Ratio = 1", 
           hjust = 0, 
           vjust = 0, 
           color = "grey", 
           size = 3.5, 
           fontface = "italic") +
  
  theme_minimal() +
  
  # Map the precomputed Fill_Color labels to legend text
  scale_fill_manual(
    values = c("steelblue" = "steelblue", "coral" = "coral"),
    labels = c("steelblue" = "Positively Associated with Price Increase", 
               "coral" = "Negatively Associated with Price Increase"),
    name = "") +
  
  theme(
    plot.title = element_text(size = 16, 
                              face = "bold", 
                              hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

Model 5: Same variables as Model 1, plus the type-of-security variable

# Select independent variable
# Select independent variables for Model 5 (Model 1 set + ETF dummy)
X_logit_m5 <- task3 %>% 
  select(Mkt_beta, SMB_beta, HML_beta, log_spread, log_volatility, scaled_market_cap, log_tracking_error, log_turnover_ratio, security_typeETF)

# Response: indicator of a price increase over the COVID window
y_logit_m5 <- task3$up_PRC  

# Split the dataset into training and testing sets (80/20)
set.seed(675) 

trainIndex <- createDataPartition(y_logit_m5, p = 0.8, list = FALSE)
X_train_logit_m5 <- X_logit_m5[trainIndex, ]
X_test_logit_m5 <- X_logit_m5[-trainIndex, ]
y_train_logit_m5 <- y_logit_m5[trainIndex]
y_test_logit_m5 <- y_logit_m5[-trainIndex]


# Fit the logistic regression model.
# Fix: the second positional argument of as.data.frame() is `row.names`, so
# the old `as.data.frame(X_train_logit_m5, y_train_logit_m5)` passed the
# response as row names. The response is resolved from the calling
# environment via the formula, so only the predictors belong in `data`.
logit_model_5 <- glm(y_train_logit_m5 ~ ., 
                     data = as.data.frame(X_train_logit_m5),
                     family = binomial())

# Display model summary
summary(logit_model_5)
## 
## Call:
## glm(formula = y_train_logit_m5 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m5, 
##     y_train_logit_m5))
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          2.29389    0.72950   3.144  0.00166 ** 
## Mkt_beta           -82.15234   12.30411  -6.677 2.44e-11 ***
## SMB_beta            -2.70889    7.68745  -0.352  0.72455    
## HML_beta            -1.55521    9.92702  -0.157  0.87551    
## log_spread          -0.48394    0.09657  -5.011 5.41e-07 ***
## log_volatility      -0.22405    0.11883  -1.885  0.05938 .  
## scaled_market_cap   -0.09525    0.29013  -0.328  0.74269    
## log_tracking_error   2.60729    0.22094  11.801  < 2e-16 ***
## log_turnover_ratio   0.01454    0.09322   0.156  0.87606    
## security_typeETF     1.44861    0.34167   4.240 2.24e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1432.02  on 4151  degrees of freedom
## Residual deviance:  993.57  on 4142  degrees of freedom
## AIC: 1013.6
## 
## Number of Fisher Scoring iterations: 8
# Model 5: out-of-sample evaluation on the held-out test set.
# Calculate predicted probabilities on testing data
pred_prob_m5 <- predict(logit_model_5, 
                        newdata = as.data.frame(X_test_logit_m5), 
                        type = "response")

# Determine classification based on threshold of 0.5
class_m5 <- ifelse(pred_prob_m5 > 0.5, 1, 0)

# Calculate True Positives, True Negatives, False Positives, False Negatives
true_positives_m5 <- sum(y_test_logit_m5 == 1 & class_m5 == 1)
true_negatives_m5 <- sum(y_test_logit_m5 == 0 & class_m5 == 0)
false_positives_m5 <- sum(y_test_logit_m5 == 0 & class_m5 == 1)
false_negatives_m5 <- sum(y_test_logit_m5 == 1 & class_m5 == 0)

# Calculate Accuracy
accuracy_m5 <- (true_positives_m5 + true_negatives_m5) / (true_positives_m5 + true_negatives_m5 + false_positives_m5 + false_negatives_m5)

# Calculate Precision (0 when there are no predicted positives)
precision_m5 <- ifelse((true_positives_m5 + false_positives_m5) == 0, 0, true_positives_m5 / (true_positives_m5 + false_positives_m5))

# Calculate Recall (0 when there are no actual positives)
recall_m5 <- ifelse((true_positives_m5 + false_negatives_m5) == 0, 0, true_positives_m5 / (true_positives_m5 + false_negatives_m5))

# Calculate F1 Score
# NOTE(review): F1 is NaN if precision and recall are both 0 — did not occur
# for the reported split, but worth guarding if the split changes.
f1_m5 <- 2 * (precision_m5 * recall_m5) / (precision_m5 + recall_m5)

# Compute ROC curve and AUC on the test set (pROC)
roc_curve_m5 <- roc(y_test_logit_m5, pred_prob_m5)
auc_value_m5 <- auc(roc_curve_m5)

# Calculate McFadden’s R-squared against the intercept-only null model
null_model_5 <- glm(y_train_logit_m5 ~ 1, data = as.data.frame(cbind(X_train_logit_m5, y_train_logit_m5)), family = binomial())
rsquared_logit_m5 <- 1 - (logLik(logit_model_5) / logLik(null_model_5))

# Collect confusion-matrix cell counts into a tidy frame for reporting
metrics_counts_logit_m5 <- data.frame(
  Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
  Count = c(
    as.integer(true_positives_m5), 
    as.integer(true_negatives_m5), 
    as.integer(false_positives_m5), 
    as.integer(false_negatives_m5)
  )
)

# Collect the evaluation metrics (R_squared here is McFadden's pseudo R2)
metrics_evaluation_logit_m5 <- data.frame(
  Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
  Value = c(
    accuracy_m5, 
    precision_m5, 
    recall_m5,
    f1_m5,
    auc_value_m5,
    rsquared_logit_m5
  )
)

# Print the results
print(metrics_counts_logit_m5)
##            Metric Count
## 1  True Positives     9
## 2  True Negatives  1011
## 3 False Positives     2
## 4 False Negatives    16
print(metrics_evaluation_logit_m5)
##      Metric     Value
## 1  Accuracy 0.9826590
## 2 Precision 0.8181818
## 3    Recall 0.3600000
## 4  F1 Score 0.5000000
## 5       AUC 0.9130109
## 6 R_squared 0.3061749

Confusion Matrix

# Confusion-matrix cells for Model 5; matrix() fills column-wise, so rows are
# the predicted class and columns the actual class.
true_false_m5 <- matrix(c(true_negatives_m5, false_positives_m5, false_negatives_m5, true_positives_m5),
                        nrow = 2,
                        dimnames = list(Predicted = c("Negative", "Positive"),
                                        Actual = c("Negative", "Positive")))
                                             
conf_matrix_m5 <- as.data.frame(as.table(true_false_m5))

# Plot the confusion matrix
# NOTE(review): `palette` is assigned but never used below (the scale calls
# brewer.pal() directly) and it shadows grDevices::palette().
palette <- brewer.pal(n = 4, 
                      name = "Greens")  # Define color

conf_matrix_m5_plot <- ggplot(conf_matrix_m5, 
                              aes(x = Predicted, 
                                  y = Actual, 
                                  fill = Freq)) +
  
  geom_tile(color = "black") + 
  
  # Show the raw count inside each tile
  geom_text(aes(label = Freq), 
            color = "black", 
            size = 6) +
  
  scale_fill_gradientn(colors = brewer.pal(n = 4, 
                                           name = "Greens"), 
                       guide = "none") +
  
  labs(title = "Confusion Matrix for Model 5: Price Increase Predictions between February 14, 2020 to March 20, 2020",
       subtitle = "(Model 5: Same variables with Model 1 and with Type of security variable)",
       x = "Predicted", 
       y = "Actual") +
  
  theme_minimal() +
  
  theme(
      plot.title = element_text(size = 16, 
                                face = "bold", 
                                hjust = 0.5),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 12)
    )

print(conf_matrix_m5_plot)  

# Save the plot for report (project helper)
save_plot(conf_matrix_m5_plot, "Confusion Matrix for logit Model 5")

Important Factors

# Extract the fitted log-odds coefficients of Model 5
coefficients_logit_m5 <- coef(logit_model_5)

# Convert coefficients to odds ratios (exp of the log-odds)
odds_ratios_m5 <- exp(coefficients_logit_m5)

# Factor-importance table: one row per model term
importance_logit_m5 <- data.frame(
  Factors = names(coefficients_logit_m5),
  Odds_Ratio = odds_ratios_m5
)

# Exclude the intercept row. Fix: the column is named "Factors"; the previous
# `$Factor` only worked through base R's partial matching on data frames.
importance_logit_m5 <- importance_logit_m5[importance_logit_m5$Factors != "(Intercept)", ]

# Colour key: odds ratio > 1 means positively associated with a price increase
importance_logit_m5$Fill_Color <- ifelse(importance_logit_m5$Odds_Ratio > 1, "steelblue", "coral")

# Plot the factor
# Horizontal bar chart of odds ratios, ordered by size; bars above the dashed
# line at 1 raise the odds of a price increase, bars below lower them.
ggplot(importance_logit_m5, 
       aes(x = reorder(Factors, Odds_Ratio), 
           y = Odds_Ratio, 
           fill = Fill_Color)) +
  
  geom_bar(stat = "identity") +
  
  coord_flip() +
  
  # Label each bar with its rounded odds ratio
  geom_text(aes(label = round(Odds_Ratio, 2)), 
            hjust = 1, 
            size = 3) + 
  
  labs(title = "Impact of Factors on Price increase likelihood for Model 5: Positive Vs Negative Associations",
       subtitle = "(Model 5: Continuous Variables from Model 1 with type of security variable)", 
       x = "Factors",
       y = "Odds Ratio\n(The exponentiation of coefficients)") +
  
  # Reference line: odds ratio of 1 means no association
  geom_hline(yintercept = 1, 
             linetype = "dashed", 
             color = "red") +
  
  annotate("text", 
           x = Inf, 
           y = 1, 
           label = "Odds Ratio = 1", 
           hjust = 0, 
           vjust = 0, 
           color = "grey", 
           size = 3.5, 
           fontface = "italic") +
  
  theme_minimal() +
  
  # Map the precomputed Fill_Color labels to legend text
  scale_fill_manual(
    values = c("steelblue" = "steelblue", "coral" = "coral"),
    labels = c("steelblue" = "Positively Associated with Price Increase", 
               "coral" = "Negatively Associated with Price Increase"),
    name = "") +
  
  theme(
    plot.title = element_text(size = 16, 
                              face = "bold", 
                              hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

Model 6: Same variables as Model 3, plus the type-of-security variable

# Select independent variable
X_logit_m6 <- task3 %>%
  select(Mkt_beta, SMB_beta, scaled_dollar_vol, log_spread, log_volatility, log_tracking_error, log_turnover_ratio, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown, security_typeETF)

y_logit_m6 <- task3$up_PRC  

# Split the dataset into training and testing sets
set.seed(675)  

trainIndex <- createDataPartition(y_logit_m6, p = 0.8, list = FALSE)
X_train_logit_m6 <- X_logit_m6[trainIndex, ]
X_test_logit_m6 <- X_logit_m6[-trainIndex, ]
y_train_logit_m6 <- y_logit_m6[trainIndex]
y_test_logit_m6 <- y_logit_m6[-trainIndex]


# Fit the logistic regression model
logit_model_6 <- glm(y_train_logit_m6 ~ ., 
                     data = as.data.frame(X_train_logit_m6, y_train_logit_m6),
                     family = binomial())

# Display model summary
summary(logit_model_6)
## 
## Call:
## glm(formula = y_train_logit_m6 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m6, 
##     y_train_logit_m6))
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    4.16952    0.88471   4.713 2.44e-06 ***
## Mkt_beta                     -79.81355   12.37080  -6.452 1.11e-10 ***
## SMB_beta                      -4.09127    8.04088  -0.509 0.610887    
## scaled_dollar_vol             -0.17401    0.24324  -0.715 0.474380    
## log_spread                    -0.49625    0.10032  -4.947 7.55e-07 ***
## log_volatility                -0.24133    0.11928  -2.023 0.043059 *  
## log_tracking_error             2.70666    0.23395  11.569  < 2e-16 ***
## log_turnover_ratio            -0.01594    0.09444  -0.169 0.865986    
## SECTORMaterials               -1.94865    0.70959  -2.746 0.006029 ** 
## SECTORIndustrials             -3.32320    0.82387  -4.034 5.49e-05 ***
## SECTORFinancials              -2.56701    0.66797  -3.843 0.000122 ***
## SECTORInformation_Technology  -2.49134    0.64171  -3.882 0.000103 ***
## SECTORConsumer_Discretionary  -3.65068    1.05825  -3.450 0.000561 ***
## SECTORHealth_Care             -1.37650    0.41603  -3.309 0.000937 ***
## SECTOREnergy                  -2.25860    0.92833  -2.433 0.014975 *  
## SECTORUtilities               -1.88301    1.07049  -1.759 0.078574 .  
## SECTORReal_Estate            -15.69765  849.22171  -0.018 0.985252    
## SECTORCommunication_Services -16.01693  422.86756  -0.038 0.969786    
## SECTORUnknown                 -1.72958    0.39903  -4.334 1.46e-05 ***
## security_typeETF               1.41723    0.39070   3.627 0.000286 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1432.02  on 4151  degrees of freedom
## Residual deviance:  944.25  on 4132  degrees of freedom
## AIC: 984.25
## 
## Number of Fisher Scoring iterations: 16
# Model 6: out-of-sample evaluation on the held-out test set ---------------

# Predicted probability of a price increase for each test observation
pred_prob_m6 <- predict(logit_model_6,
                        newdata = as.data.frame(X_test_logit_m6),
                        type = "response")

# Hard class labels at the conventional 0.5 cut-off
class_m6 <- ifelse(pred_prob_m6 > 0.5, 1, 0)

# Confusion-matrix cell counts via logical masks
actual_pos_m6 <- y_test_logit_m6 == 1
pred_pos_m6 <- class_m6 == 1
true_positives_m6  <- sum(actual_pos_m6 & pred_pos_m6)
true_negatives_m6  <- sum(!actual_pos_m6 & !pred_pos_m6)
false_positives_m6 <- sum(!actual_pos_m6 & pred_pos_m6)
false_negatives_m6 <- sum(actual_pos_m6 & !pred_pos_m6)

# Accuracy, precision and recall (empty denominators are mapped to 0), F1
n_test_m6 <- true_positives_m6 + true_negatives_m6 +
  false_positives_m6 + false_negatives_m6
accuracy_m6 <- (true_positives_m6 + true_negatives_m6) / n_test_m6

precision_m6 <- ifelse((true_positives_m6 + false_positives_m6) == 0, 0,
                       true_positives_m6 / (true_positives_m6 + false_positives_m6))

recall_m6 <- ifelse((true_positives_m6 + false_negatives_m6) == 0, 0,
                    true_positives_m6 / (true_positives_m6 + false_negatives_m6))

f1_m6 <- 2 * (precision_m6 * recall_m6) / (precision_m6 + recall_m6)

# Discrimination on the test set: ROC curve and its AUC (pROC)
roc_curve_m6 <- roc(y_test_logit_m6, pred_prob_m6)
auc_value_m6 <- auc(roc_curve_m6)

# McFadden's pseudo R-squared against the intercept-only null model
null_model_6 <- glm(y_train_logit_m6 ~ 1,
                    data = as.data.frame(cbind(X_train_logit_m6, y_train_logit_m6)),
                    family = binomial())
rsquared_logit_m6 <- 1 - (logLik(logit_model_6) / logLik(null_model_6))

# Collect confusion-matrix cell counts into a tidy frame for reporting
metrics_counts_logit_m6 <- data.frame(
  Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
  Count = c(
    as.integer(true_positives_m6), 
    as.integer(true_negatives_m6), 
    as.integer(false_positives_m6), 
    as.integer(false_negatives_m6)
  )
)

# Collect the evaluation metrics (R_squared here is McFadden's pseudo R2)
metrics_evaluation_logit_m6 <- data.frame(
  Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
  Value = c(
    accuracy_m6, 
    precision_m6, 
    recall_m6,
    f1_m6,
    auc_value_m6,
    rsquared_logit_m6
  )
)

# Print the results
print(metrics_counts_logit_m6)
##            Metric Count
## 1  True Positives    11
## 2  True Negatives  1010
## 3 False Positives     3
## 4 False Negatives    14
print(metrics_evaluation_logit_m6)
##      Metric     Value
## 1  Accuracy 0.9836224
## 2 Precision 0.7857143
## 3    Recall 0.4400000
## 4  F1 Score 0.5641026
## 5       AUC 0.8898322
## 6 R_squared 0.3406185

Confusion Matrix

# Confusion-matrix cells for Model 6; matrix() fills column-wise, so rows are
# the predicted class and columns the actual class.
true_false_m6 <- matrix(c(true_negatives_m6, false_positives_m6, false_negatives_m6, true_positives_m6),
                        nrow = 2,
                        dimnames = list(Predicted = c("Negative", "Positive"),
                                        Actual = c("Negative", "Positive")))
                                             
conf_matrix_m6 <- as.data.frame(as.table(true_false_m6))

# Plot the confusion matrix
# NOTE(review): `palette` is assigned but never used below (the scale calls
# brewer.pal() directly) and it shadows grDevices::palette().
palette <- brewer.pal(n = 4, 
                      name = "Greens")  # Define color

conf_matrix_m6_plot <- ggplot(conf_matrix_m6, 
                              aes(x = Predicted, 
                                  y = Actual, 
                                  fill = Freq)) +
  
  geom_tile(color = "black") + 
  
  # Show the raw count inside each tile
  geom_text(aes(label = Freq), 
            color = "black", 
            size = 6) +
  
  scale_fill_gradientn(colors = brewer.pal(n = 4, 
                                           name = "Greens"), 
                       guide = "none") +
  
  labs(title = "Confusion Matrix for Model 6: Price Increase Predictions between February 14, 2020 to March 20, 2020",
       subtitle = "(Model 6: Same variables with Model 3 and with Type of security variable)",
       x = "Predicted", 
       y = "Actual") +
  
  theme_minimal() +
  
  theme(
      plot.title = element_text(size = 16, 
                                face = "bold", 
                                hjust = 0.5),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 12)
    )

print(conf_matrix_m6_plot)  

# Save the plot for report (project helper)
save_plot(conf_matrix_m6_plot, "Confusion Matrix for logit Model 6")

Important Factors

# Extract the fitted log-odds coefficients of Model 6
coefficients_logit_m6 <- coef(logit_model_6)

# Convert coefficients to odds ratios (exp of the log-odds)
odds_ratios_m6 <- exp(coefficients_logit_m6)

# Factor-importance table: one row per model term
importance_logit_m6 <- data.frame(
  Factors = names(coefficients_logit_m6),
  Odds_Ratio = odds_ratios_m6
)

# Exclude the intercept row. Fix: the column is named "Factors"; the previous
# `$Factor` only worked through base R's partial matching on data frames.
importance_logit_m6 <- importance_logit_m6[importance_logit_m6$Factors != "(Intercept)", ]

# Colour key: odds ratio > 1 means positively associated with a price increase
importance_logit_m6$Fill_Color <- ifelse(importance_logit_m6$Odds_Ratio > 1, "steelblue", "coral")

# Plot with positive and negative impacts
# Plot the factor
# Horizontal bar chart of odds ratios, ordered by size; bars above the dashed
# line at 1 raise the odds of a price increase, bars below lower them.
# Fix: the title previously said "Model 5" although this plot is for Model 6.
ggplot(importance_logit_m6, 
       aes(x = reorder(Factors, Odds_Ratio), 
           y = Odds_Ratio, 
           fill = Fill_Color)) +
  
  geom_bar(stat = "identity") +
  
  coord_flip() +
  
  # Label each bar with its rounded odds ratio
  geom_text(aes(label = round(Odds_Ratio, 2)), 
            hjust = 1, 
            size = 3) + 
  
  labs(title = "Impact of Factors on Price increase likelihood for Model 6: Positive Vs Negative Associations",
       subtitle = "(Model 6: Same variables with Model 3 and with Type of security variable)", 
       x = "Factors",
       y = "Odds Ratio\n(The exponentiation of coefficients)") +
  
  # Reference line: odds ratio of 1 means no association
  geom_hline(yintercept = 1, 
             linetype = "dashed", 
             color = "red") +
  
  annotate("text", 
           x = Inf, 
           y = 1, 
           label = "Odds Ratio = 1", 
           hjust = 0, 
           vjust = 0, 
           color = "grey", 
           size = 3.5, 
           fontface = "italic") +
  
  theme_minimal() +
  
  # Map the precomputed Fill_Color labels to legend text
  scale_fill_manual(
    values = c("steelblue" = "steelblue", "coral" = "coral"),
    labels = c("steelblue" = "Positively Associated with Price Increase", 
               "coral" = "Negatively Associated with Price Increase"),
    name = "") +
  
  theme(
    plot.title = element_text(size = 16, 
                              face = "bold", 
                              hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    axis.title.y = element_text(color = 'black'),
    axis.text.y = element_text(color = 'black'),
    legend.position = 'bottom'
  ) 

Differences in variables across the logistic models

# Predictor names used by each fitted logistic model (project helper)
logit_m1_v <- get_variables(logit_model_1)
logit_m2_v <- get_variables(logit_model_2)
logit_m3_v <- get_variables(logit_model_3)
logit_m4_v <- get_variables(logit_model_4)
logit_m5_v <- get_variables(logit_model_5)
logit_m6_v <- get_variables(logit_model_6)

logit_v_list <- list(
  Model_1 = as.character(logit_m1_v),
  Model_2 = as.character(logit_m2_v),
  Model_3 = as.character(logit_m3_v),
  Model_4 = as.character(logit_m4_v),
  Model_5 = as.character(logit_m5_v),
  Model_6 = as.character(logit_m6_v)
)

# Build one long (model, variable, order) table, preserving each model's
# original variable order. Fix: replaced the rbind()-in-a-loop pattern
# (quadratic copying) with a single bind over purrr::imap().
logit_model_data <- bind_rows(
  imap(logit_v_list, function(variables, model) {
    data.frame(
      model = model,
      variable = variables,
      order = seq_along(variables),
      stringsAsFactors = FALSE
    )
  })
)

# Convert variable to a factor whose levels follow the original ordering so
# the y axis of the tile chart is stable
logit_model_data$variable <- factor(logit_model_data$variable, levels = unique(logit_model_data$variable[order(logit_model_data$order)]))

# Presence/absence tile chart: one tile per (model, variable) pair
logit_models_v <- ggplot(logit_model_data, 
                         aes(x = model, 
                             y = variable, 
                             fill = model)) +
  
  geom_tile(color = "white") + 
  
  labs(
    title = "Variables Included in Each Logistic Model",
    x = "Model",
    y = "Variables"
  ) +
  
  theme_minimal() +
  
  theme(
    axis.text.x = element_text(angle = 45, 
                               hjust = 1, 
                               size = 12),  
    axis.text.y = element_text(size = 12),  
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14), 
    plot.title = element_text(hjust = 0.5, 
                              size = 16),  
    plot.title.position = "plot")

print(logit_models_v)

# Save the plot for report (project helper)
save_plot(logit_models_v, "logit_models_v")

4 K-means Clustering

# Load libraries for clustering and feature importance
library(randomForest)
library(cluster)

# Import data
Stock_data_2 <- read.csv("Stock_data_part2.csv")

# Parse public_date (d/m/Y in the raw file) into Date.
# Fix: the old Date -> format() -> as.Date() round-trip was a no-op; a single
# as.Date() call yields the same ISO-ordered Date values.
Stock_data_2$public_date <- as.Date(Stock_data_2$public_date, format = "%d/%m/%Y")

# Inspect data
str(Stock_data_2)
## 'data.frame':    577832 obs. of  13 variables:
##  $ public_date: Date, format: "2010-01-31" "2010-02-28" ...
##  $ CAPEI      : num  17.9 17.6 19.3 18.3 14.8 ...
##  $ bm         : num  1.077 1.077 1.077 0.898 0.898 ...
##  $ evm        : num  7.98 7.98 7.98 9.29 9.29 ...
##  $ pe_exi     : num  16.4 16.1 17.6 19.3 15.6 ...
##  $ dpr        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ npm        : num  0.041 0.041 0.041 0.037 0.037 0.037 0.033 0.033 0.033 0.037 ...
##  $ roa        : num  0.095 0.095 0.095 0.086 0.086 0.086 0.093 0.093 0.093 0.078 ...
##  $ roe        : num  0.079 0.079 0.079 0.067 0.067 0.067 0.059 0.059 0.059 0.064 ...
##  $ roce       : num  0.099 0.099 0.099 0.097 0.097 0.097 0.083 0.083 0.083 0.097 ...
##  $ ptb        : num  1.152 1.128 1.237 1.197 0.968 ...
##  $ divyield   : chr  "" "" "" "" ...
##  $ TICKER     : chr  "AIR" "AIR" "AIR" "AIR" ...
head(Stock_data_2, n = 5)
##   public_date  CAPEI    bm   evm pe_exi dpr   npm   roa   roe  roce   ptb
## 1  2010-01-31 17.934 1.077 7.981 16.433   0 0.041 0.095 0.079 0.099 1.152
## 2  2010-02-28 17.554 1.077 7.981 16.085   0 0.041 0.095 0.079 0.099 1.128
## 3  2010-03-31 19.257 1.077 7.981 17.603   0 0.041 0.095 0.079 0.099 1.237
## 4  2010-04-30 18.304 0.898 9.293 19.349   0 0.037 0.086 0.067 0.097 1.197
## 5  2010-05-31 14.791 0.898 9.293 15.635   0 0.037 0.086 0.067 0.097 0.968
##   divyield TICKER
## 1             AIR
## 2             AIR
## 3             AIR
## 4             AIR
## 5             AIR

Select the two most important variables for clustering

# Create a cross-sectional dataset: one row per ticker, each financial ratio
# collapsed to its median over time, NA rows dropped.
task4_select_ratios <- na.omit(
  Stock_data_2 %>%
    filter(TICKER != "") %>%
    group_by(TICKER) %>%
    summarise(
      # NOTE(review): TICKER = first(TICKER) is redundant — the grouping key
      # is already carried through summarise().
      TICKER = first(TICKER),
      CAPEI = median(CAPEI, na.rm = TRUE),
      bm = median(bm, na.rm = TRUE),
      evm = median(evm, na.rm = TRUE),
      pe_exi = median(pe_exi, na.rm = TRUE),
      dpr = median(dpr, na.rm = TRUE),
      npm = median(npm, na.rm = TRUE),
      roa = median(roa, na.rm = TRUE),
      roe = median(roe, na.rm = TRUE),
      roce = median(roce, na.rm = TRUE),
      ptb = median(ptb, na.rm = TRUE),
      # divyield arrives as a character like "1.2%"; strip the % and parse
      divyield = median(as.numeric(sub("%", "", divyield)), na.rm = TRUE),
      .groups = "drop"
    )) %>% 
  select(-TICKER)


# Random Forest feature importance.
# y = NULL runs randomForest in unsupervised mode; MeanDecreaseGini then
# reflects how strongly each ratio separates real from synthetic data.
task4_rf_model <- randomForest(x = task4_select_ratios, y = NULL, ntree = 100)
task4_select_ratios_rf <- importance(task4_rf_model)
task4_rf_df <- data.frame(Feature = rownames(task4_select_ratios_rf), 
                             MeanDecreaseGini = task4_select_ratios_rf[, "MeanDecreaseGini"])

# Plot the importance ranking (used to pick the two clustering variables)
task4_select <- ggplot(task4_rf_df, 
                       aes(x = reorder(Feature, MeanDecreaseGini), 
                           y = MeanDecreaseGini)) +
  
  geom_bar(stat = "identity", fill = "lightblue") +
  
  coord_flip() +
  
  labs(title = "Feature Importance of financial ratio", 
       x = "Financial Ratio", 
       y = "Mean Decrease in Gini") +
  
  theme_minimal()+ 
  
    theme(
      plot.title = element_text(size = 16, 
                                face = "bold", 
                                hjust = 0.5),
      axis.title = element_text(size = 14),
      axis.text = element_text(size = 12)
    )

print(task4_select)

# Save the plot for report (project helper)
save_plot(task4_select, "task4_select")

Create a function to transform the dataset into cross-sectional form

# Build the clustering inputs for one time window.
#
# Aggregates per-ticker medians of B/M and ROE from `data` up to (exclusive)
# `filter_date`, removes outliers by the 1.5*IQR rule, joins pre-COVID and
# COVID returns/Sharpe ratios/volatility from the global `task2` data frame,
# standardises B/M and ROE, and writes two objects into the global
# environment under the supplied names (side effect via assign()).
#
# Args:
#   data                  - Stock_data_2-shaped frame with TICKER, public_date, bm, roe
#   filter_date           - only rows with public_date strictly before this are used
#   output_name_cleaned   - global name for the full joined/cleaned frame
#   output_name_clustering- global name for the 2-column scaled frame fed to k-means
#
# NOTE(review): depends on the global `task2`; scale() returns 1-column
# matrices, so bm_scaled/roe_scaled are matrix columns (visible as <dbl[,1]>
# in the printed output) — wrap in as.numeric() if plain vectors are needed.
task4_dataset_transformation <- function(data, filter_date, output_name_cleaned, output_name_clustering) {
  
  # Per-ticker medians before the cut-off date
  aggregated_data <- data %>%
    filter(TICKER != "") %>%
    filter(public_date < filter_date) %>% 
    group_by(TICKER) %>%
    summarise(
      TICKER = first(TICKER),
      # get the median (avoid outlier affect the result)
      bm = median(bm, na.rm = TRUE),
      roe = median(roe, na.rm = TRUE),
      .groups = "drop"
    )
  
  print(head(aggregated_data, n = 5))
  
  # Remove outliers for B/M ratio by IQR method
  Q1_BM <- quantile(aggregated_data$bm, 0.25, na.rm = TRUE)
  Q3_BM <- quantile(aggregated_data$bm, 0.75, na.rm = TRUE)
  IQR_BM <- Q3_BM - Q1_BM
  cleaned_data <- aggregated_data[aggregated_data$bm > (Q1_BM - 1.5 * IQR_BM) & 
                                     aggregated_data$bm < (Q3_BM + 1.5 * IQR_BM), ]
  
  # Remove outliers for ROE with IQR method
  Q1_ROE <- quantile(cleaned_data$roe, 0.25, na.rm = TRUE)
  Q3_ROE <- quantile(cleaned_data$roe, 0.75, na.rm = TRUE)
  IQR_ROE <- Q3_ROE - Q1_ROE
  cleaned_data <- cleaned_data[cleaned_data$roe > (Q1_ROE - 1.5 * IQR_ROE) & 
                                 cleaned_data$roe < (Q3_ROE + 1.5 * IQR_ROE), ]
  
  # Join with returns data - calculate return using the price of the last day and the first day from the period to get the % of price change
  # (non-COVID window: 2019-12-14..2020-01-20; COVID window: 2020-02-14..2020-03-20)
  cluster_data <- cleaned_data %>%
    left_join(task2 %>%  
                filter(date >= "2019-12-14" & date <= "2020-01-20") %>%
                group_by(TICKER) %>%
                summarise(
                  noncovid_RET = (PRC[which.max(date)] - PRC[which.min(date)]) / PRC[which.min(date)]), 
              by = "TICKER") %>%
    
    left_join(task2 %>%  
                filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
                group_by(TICKER) %>%
                summarise(
                  COVID_RET = (PRC[which.max(date)] - PRC[which.min(date)]) / PRC[which.min(date)]),
              by = "TICKER") %>% 
    
    # Window-average risk measures, pre-COVID...
    left_join(task2 %>%  
                filter(date >= "2019-12-14" & date <= "2020-01-20") %>%
                group_by(TICKER) %>%
                summarise(
                  noncovid_sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
                  noncovid_volatility = mean(volatility, na.rm = TRUE)
                ), 
              by = "TICKER") %>% 
    
    # ...and during COVID
    left_join(task2 %>%  
                filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
                group_by(TICKER) %>%
                summarise(
                  covid_sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
                  covid_volatility = mean(volatility, na.rm = TRUE)
                ), 
              by = "TICKER") %>%
    
    # Remove missing value
    filter(!is.na(bm) & !is.na(roe)) %>%
    
    # Standardise variable to ensure they are on same scale
    mutate(
      bm_scaled = scale(bm),
      roe_scaled = scale(roe)
    )
  
  print(head(cluster_data, n = 5))
  
  # Select scaled variables
  output_data <- cluster_data[, c("bm_scaled", "roe_scaled")]
  
  print(head(output_data, n = 5))
  
  # Assign output name
  assign(output_name_cleaned, cluster_data, envir = .GlobalEnv)
  assign(output_name_clustering, output_data, envir = .GlobalEnv)
}

Call the function to create the cleaned and clustering-ready datasets for each period

# Entire Period: date argument "2022-12-31" presumably selects the fundamentals
# snapshot cutoff — confirm against the function definition above. Creates
# task4_entire and task4_cluster_entire in the global environment via assign().
task4_dataset_transformation(Stock_data_2, "2022-12-31", "task4_entire", "task4_cluster_entire")
## # A tibble: 5 × 3
##   TICKER    bm    roe
##   <chr>  <dbl>  <dbl>
## 1 A      0.266  0.149
## 2 AA     0.927  0.001
## 3 AAC    0.712 -0.006
## 4 AACC   1.13   0.005
## 5 AACQ   0.698 -0.029
## # A tibble: 5 × 11
##   TICKER    bm    roe noncovid_RET COVID_RET noncovid_sharpe_ratio
##   <chr>  <dbl>  <dbl>        <dbl>     <dbl>                 <dbl>
## 1 A      0.266  0.149       0.0671    -0.226                -175. 
## 2 AA     0.927  0.001      -0.157     -0.650                 -82.5
## 3 AAC    0.712 -0.006      NA         NA                      NA  
## 4 AACC   1.13   0.005      NA         NA                      NA  
## 5 AACQ   0.698 -0.029      NA         NA                      NA  
## # ℹ 5 more variables: noncovid_volatility <dbl>, covid_sharpe_ratio <dbl>,
## #   covid_volatility <dbl>, bm_scaled <dbl[,1]>, roe_scaled <dbl[,1]>
## # A tibble: 5 × 2
##   bm_scaled[,1] roe_scaled[,1]
##           <dbl>          <dbl>
## 1        -0.874         0.723 
## 2         0.968         0.169 
## 3         0.369         0.143 
## 4         1.53          0.184 
## 5         0.330         0.0569
# Non-COVID period: cutoff "2019-12-14" (start of the non-COVID window).
# Creates task4_noncovid and task4_cluster_noncovid in the global environment.
task4_dataset_transformation(Stock_data_2, "2019-12-14", "task4_noncovid", "task4_cluster_noncovid")
## # A tibble: 5 × 3
##   TICKER    bm    roe
##   <chr>  <dbl>  <dbl>
## 1 A      0.29   0.133
## 2 AA     0.961  0.004
## 3 AAC    0.712 -0.006
## 4 AACC   1.13   0.005
## 5 AAI    0.714  0.113
## # A tibble: 5 × 11
##   TICKER    bm    roe noncovid_RET COVID_RET noncovid_sharpe_ratio
##   <chr>  <dbl>  <dbl>        <dbl>     <dbl>                 <dbl>
## 1 A      0.29   0.133       0.0671    -0.226                -175. 
## 2 AA     0.961  0.004      -0.157     -0.650                 -82.5
## 3 AAC    0.712 -0.006      NA         NA                      NA  
## 4 AACC   1.13   0.005      NA         NA                      NA  
## 5 AAI    0.714  0.113      NA         NA                      NA  
## # ℹ 5 more variables: noncovid_volatility <dbl>, covid_sharpe_ratio <dbl>,
## #   covid_volatility <dbl>, bm_scaled <dbl[,1]>, roe_scaled <dbl[,1]>
## # A tibble: 5 × 2
##   bm_scaled[,1] roe_scaled[,1]
##           <dbl>          <dbl>
## 1        -0.896         0.597 
## 2         0.946        -0.0776
## 3         0.262        -0.130 
## 4         1.41         -0.0723
## 5         0.268         0.492
# COVID period: cutoff "2020-02-14" (start of the COVID window).
# Creates task4_covid and task4_cluster_covid in the global environment.
task4_dataset_transformation(Stock_data_2, "2020-02-14", "task4_covid", "task4_cluster_covid")
## # A tibble: 5 × 3
##   TICKER    bm    roe
##   <chr>  <dbl>  <dbl>
## 1 A      0.29   0.133
## 2 AA     0.961  0.001
## 3 AAC    0.712 -0.006
## 4 AACC   1.13   0.005
## 5 AAI    0.714  0.113
## # A tibble: 5 × 11
##   TICKER    bm    roe noncovid_RET COVID_RET noncovid_sharpe_ratio
##   <chr>  <dbl>  <dbl>        <dbl>     <dbl>                 <dbl>
## 1 A      0.29   0.133       0.0671    -0.226                -175. 
## 2 AA     0.961  0.001      -0.157     -0.650                 -82.5
## 3 AAC    0.712 -0.006      NA         NA                      NA  
## 4 AACC   1.13   0.005      NA         NA                      NA  
## 5 AAI    0.714  0.113      NA         NA                      NA  
## # ℹ 5 more variables: noncovid_volatility <dbl>, covid_sharpe_ratio <dbl>,
## #   covid_volatility <dbl>, bm_scaled <dbl[,1]>, roe_scaled <dbl[,1]>
## # A tibble: 5 × 2
##   bm_scaled[,1] roe_scaled[,1]
##           <dbl>          <dbl>
## 1        -0.896         0.598 
## 2         0.945        -0.0896
## 3         0.262        -0.126 
## 4         1.41         -0.0687
## 5         0.267         0.494

Create function to generate Elbow plot

generate_elbow_plot <- function(find_k_data, elbow_plot_name, max_k = 10) {
  # Generate and save an elbow plot to help choose k for k-means.
  #
  # Args:
  #   find_k_data:     data frame / matrix of (scaled) features to cluster.
  #   elbow_plot_name: file name passed to save_plot() for the report figure.
  #   max_k:           largest number of clusters to evaluate (default 10,
  #                    matching the original behaviour).
  #
  # Side effects: prints the inertia per k, prints the plot, and saves it
  # via save_plot(). Returns nothing useful.

  # Use NA as the sentinel so a failed kmeans() run is detectable afterwards.
  # (The previous code preallocated with numeric(10), so a failed run left a
  # silent 0 in the vector and the later `length(...) == 10` success check
  # was always TRUE.)
  inertia_cleaned <- rep(NA_real_, max_k)

  for (n in seq_len(max_k)) {
    k_mean_cleaned <- tryCatch({
      kmeans(find_k_data, centers = n, nstart = 10)
    }, error = function(e) {
      NULL
    })
    if (!is.null(k_mean_cleaned)) {
      # Total within-cluster sum of squares = "inertia" in the elbow method.
      inertia_cleaned[n] <- k_mean_cleaned$tot.withinss
      cat(sprintf("Cluster: %d, Inertia: %f\n", n, inertia_cleaned[n]))
    } else {
      cat(sprintf("An error occurred at n_clusters=%d\n", n))
    }
  }

  # Only draw the plot when every kmeans() run succeeded.
  if (!anyNA(inertia_cleaned)) {
    elbow_data_cleaned <- data.frame(
      clusters_cleaned = seq_len(max_k),
      inertia_cleaned = inertia_cleaned
    )

    # Generate the plot
    elbow_plot <- ggplot(elbow_data_cleaned,
                         aes(x = clusters_cleaned,
                             y = inertia_cleaned)) +
      geom_line(color = "blue", size = 1) +
      geom_point(color = "red",
                 size = 4,
                 shape = 19) +
      scale_x_continuous(breaks = seq_len(max_k)) +
      ggtitle('Elbow Method for Clustering') +
      xlab('Number of Clusters') +
      ylab('Inertia') +
      theme_light() +
      theme(
        plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
        axis.title = element_text(size = 14),
        axis.text = element_text(size = 12)
      )

    print(elbow_plot)

    # Save the plot for report
    save_plot(elbow_plot, elbow_plot_name)

  } else {
    cat("There is some error.")
  }
}

Create function to generate cluster visualisation

generate_cluster_visualisation <- function(clustering_data, cluster_data, cluster_plot_name, period_for_title, centers, output_name) {
  # Run k-means on the scaled features, plot the clusters with centroids,
  # save the figure, and assign the cluster-labelled data frame to the
  # global environment under `output_name`.
  #
  # Args:
  #   clustering_data:   data frame of scaled features fed to kmeans().
  #   cluster_data:      full data frame the cluster labels are attached to
  #                      (rows must align with clustering_data).
  #   cluster_plot_name: file name passed to save_plot().
  #   period_for_title:  subtitle describing the analysis period.
  #   centers:           number of clusters (k).
  #   output_name:       name of the global object to create/overwrite.

  # Fixed seed so the random k-means initialisation (nstart = 10) is
  # reproducible across runs.
  set.seed(42)

  k_mean_cleaned <- kmeans(clustering_data, centers = centers, nstart = 10)

  # Assign cluster to each stock
  cluster_data$cluster <- factor(k_mean_cleaned$cluster)

  # Convert centroids to data frame.
  # NOTE(review): renaming assumes exactly the two columns bm_scaled /
  # roe_scaled were clustered — confirm if clustering_data ever changes.
  centroids_cleaned <- as.data.frame(k_mean_cleaned$centers)
  names(centroids_cleaned) <- c("bm_scaled", "roe_scaled")

  # Plot the scatterplot with specified clusters
  cluster_visual <- ggplot(cluster_data,
                           aes(x = bm_scaled,
                               y = roe_scaled,
                               color = cluster)) +

    geom_point(size = 3) +

    # Centroids drawn as large red stars (shape 8) on top of the points.
    geom_point(data = centroids_cleaned,
               aes(x = bm_scaled,
                   y = roe_scaled,
                   shape = "Centroid"),
               color = 'red',
               size = 6,
               alpha = 1) +

    scale_shape_manual(name = "",
                       values = c("Centroid" = 8)) +

    labs(title = 'Cluster Visualisation: Market-to-Book ratio VS Return on Equity',
         subtitle = period_for_title,
         x = 'Standardised Book / Market ratio (bm)',
         y = 'Standardised Return on Equity (roe)',
         color = 'Cluster') +

    scale_color_brewer(palette = "Pastel2") +

    # theme_minimal() is a *complete* theme, so it must come before any
    # theme() tweaks. The original code had an extra theme() call before
    # theme_minimal() whose settings were silently discarded; it was removed
    # here because everything it set is re-applied below.
    theme_minimal() +

    theme(panel.grid.major = element_line(color = "gray90",
                                          size = 0.5),
          axis.title = element_text(size = 14),
          axis.text = element_text(size = 12),
          plot.title = element_text(size = 16,
                                    face = "bold"),
          plot.subtitle = element_text(size = 12))

  print(cluster_visual)

  # Save plot for report
  save_plot(cluster_visual, cluster_plot_name)

  # Assign output name (overwrites any existing global of that name)
  assign(output_name, cluster_data, envir = .GlobalEnv)
}

Call the functions to generate the elbow plots and cluster visualisations for each period

# Entire Period: inertia diagnostics for k = 1..10
generate_elbow_plot(task4_cluster_entire, "Task 4 - Elbow Plot (Entire)")
## Cluster: 1, Inertia: 12512.000000
## Cluster: 2, Inertia: 7642.593033
## Cluster: 3, Inertia: 4098.499759
## Cluster: 4, Inertia: 3178.173036
## Cluster: 5, Inertia: 2560.094173
## Cluster: 6, Inertia: 2164.649512
## Cluster: 7, Inertia: 1819.718568
## Cluster: 8, Inertia: 1594.905004
## Cluster: 9, Inertia: 1431.724197
## Cluster: 10, Inertia: 1274.023383

# k = 3 chosen from the elbow plot; overwrites task4_entire with the
# cluster-labelled version in the global environment
generate_cluster_visualisation(task4_cluster_entire, task4_entire, "Task 4 Clustering (Entire)", "Aggregate from Entire Period", 3, "task4_entire")

# Non-COVID period: inertia diagnostics for k = 1..10
generate_elbow_plot(task4_cluster_noncovid, "Task 4 - Elbow Plot (Non-COVID)")
## Cluster: 1, Inertia: 9994.000000
## Cluster: 2, Inertia: 6534.440559
## Cluster: 3, Inertia: 3474.260447
## Cluster: 4, Inertia: 2633.705462
## Cluster: 5, Inertia: 2146.807255
## Cluster: 6, Inertia: 1791.804979
## Cluster: 7, Inertia: 1501.036253
## Cluster: 8, Inertia: 1305.638966
## Cluster: 9, Inertia: 1171.676067
## Cluster: 10, Inertia: 1058.081745

# Attach k = 3 cluster labels; overwrites task4_noncovid in the global env
generate_cluster_visualisation(task4_cluster_noncovid, task4_noncovid, "Task 4 Clustering (Non-COVID)","Aggregate from Non-COVID Period", 3, "task4_noncovid")

# COVID period: inertia diagnostics for k = 1..10
generate_elbow_plot(task4_cluster_covid, "Task 4 - Elbow Plot (COVID)")
## Cluster: 1, Inertia: 10024.000000
## Cluster: 2, Inertia: 6542.387531
## Cluster: 3, Inertia: 3475.782907
## Cluster: 4, Inertia: 2637.769299
## Cluster: 5, Inertia: 2156.451101
## Cluster: 6, Inertia: 1800.543680
## Cluster: 7, Inertia: 1501.480959
## Cluster: 8, Inertia: 1303.700814
## Cluster: 9, Inertia: 1175.419993
## Cluster: 10, Inertia: 1062.107336

# Attach k = 3 cluster labels; overwrites task4_covid in the global env
generate_cluster_visualisation(task4_cluster_covid, task4_covid, "Task 4 Clustering (COVID)","Aggregate from COVID Period", 3, "task4_covid")

Cluster Characteristics

# Entire Period: mean fundamentals (bm, roe) per assigned cluster
Cluster_characteristics_entire <- task4_entire %>%
  group_by(cluster) %>%
  summarise(across(c(bm, roe), ~ mean(.x, na.rm = TRUE),
                   .names = "mean_{.col}"))

print(Cluster_characteristics_entire)
## # A tibble: 3 × 3
##   cluster mean_bm mean_roe
##   <fct>     <dbl>    <dbl>
## 1 1         0.367  0.104  
## 2 2         0.349 -0.491  
## 3 3         0.969  0.00699
# Non-COVID Period: mean fundamentals (bm, roe) per assigned cluster
Cluster_characteristics_noncovid <- task4_noncovid %>%
  group_by(cluster) %>%
  summarise(across(c(bm, roe), ~ mean(.x, na.rm = TRUE),
                   .names = "mean_{.col}"))

print(Cluster_characteristics_noncovid)
## # A tibble: 3 × 3
##   cluster mean_bm mean_roe
##   <fct>     <dbl>    <dbl>
## 1 1         0.387  -0.334 
## 2 2         0.997   0.0231
## 3 3         0.394   0.126
# COVID period: mean fundamentals (bm, roe) per assigned cluster
Cluster_characteristics_covid <- task4_covid %>%
  group_by(cluster) %>%
  summarise(across(c(bm, roe), ~ mean(.x, na.rm = TRUE),
                   .names = "mean_{.col}"))

print(Cluster_characteristics_covid)
## # A tibble: 3 × 3
##   cluster mean_bm mean_roe
##   <fct>     <dbl>    <dbl>
## 1 1         0.998   0.0233
## 2 2         0.394   0.126 
## 3 3         0.386  -0.333

Average return by cluster for each period

# Create function to calculate average return by cluster for each period
calculate_returns_cluster <- function(cluster_data, output_prefix) {
  # Print the mean non-COVID and COVID returns per cluster.
  #
  # Args:
  #   cluster_data:  data frame with `cluster`, `noncovid_RET`, `COVID_RET`.
  #   output_prefix: kept for interface compatibility but currently unused —
  #                  nothing is assigned or written under this name.
  #
  # Side effects: prints two tibbles (non-COVID first, then COVID).

  # Apply the shared filter/grouping once instead of duplicating the whole
  # pipeline: only stocks with BOTH period returns contribute to either mean.
  complete_returns <- cluster_data %>%
    filter(!is.na(COVID_RET) & !is.na(noncovid_RET)) %>%
    group_by(cluster)

  output_noncovid <- complete_returns %>%
    summarise(cluster_RET_noncovid = mean(noncovid_RET, na.rm = TRUE))

  output_covid <- complete_returns %>%
    summarise(cluster_RET_covid = mean(COVID_RET, na.rm = TRUE))

  # Print the result (same order as before: non-COVID, then COVID)
  print(output_noncovid)
  print(output_covid)
}

# Call function: mean per-cluster returns for the entire-period clustering
calculate_returns_cluster(task4_entire, "entire")
## # A tibble: 3 × 2
##   cluster cluster_RET_noncovid
##   <fct>                  <dbl>
## 1 1                     0.0403
## 2 2                     0.334 
## 3 3                     0.0606
## # A tibble: 3 × 2
##   cluster cluster_RET_covid
##   <fct>               <dbl>
## 1 1                  -0.380
## 2 2                  -0.304
## 3 3                  -0.375
# Same summary for the non-COVID-period clustering
calculate_returns_cluster(task4_noncovid, "non-covid")
## # A tibble: 3 × 2
##   cluster cluster_RET_noncovid
##   <fct>                  <dbl>
## 1 1                     0.186 
## 2 2                     0.0739
## 3 3                     0.0336
## # A tibble: 3 × 2
##   cluster cluster_RET_covid
##   <fct>               <dbl>
## 1 1                  -0.334
## 2 2                  -0.381
## 3 3                  -0.383
# Same summary for the COVID-period clustering
calculate_returns_cluster(task4_covid, "covid")
## # A tibble: 3 × 2
##   cluster cluster_RET_noncovid
##   <fct>                  <dbl>
## 1 1                     0.0729
## 2 2                     0.0334
## 3 3                     0.184 
## # A tibble: 3 × 2
##   cluster cluster_RET_covid
##   <fct>               <dbl>
## 1 1                  -0.380
## 2 2                  -0.382
## 3 3                  -0.342

Investment implications

# Function to normalise metrics and calculate composite scores
#
# For the stocks in one cluster, min-max normalises return / volatility /
# Sharpe ratio for each period, combines them into weighted composite scores
# under three strategies (return-seeking, risk-adjusted, low-risk), and prints
# the top 3 securities per strategy and period.
#
# Args:
#   data:             cluster-labelled data frame (task4_* objects).
#   selected_cluster: the cluster whose members are ranked.
#   output_prefix:    kept for interface compatibility; currently unused.
#
# Side effects: printing only. The `results_noncovid` / `results_covid` lists
# are built but neither returned nor assigned, so the full rankings are
# discarded after printing.
calculate_composite_scores <- function(data, selected_cluster, output_prefix) {
  
  # Normalisation for Non-COVID period.
  # Min-max scale each metric to [0, 1]; volatility is inverted (1 - x) so
  # that for every metric "higher = better".
  # NOTE(review): min-max normalisation divides by (max - min) — yields
  # NaN if a metric is constant within the cluster; confirm this cannot occur.
  normalised_data_noncovid <- data %>%
    filter(cluster == selected_cluster) %>%
    filter(!if_any(starts_with("noncovid_"), is.na)) %>%
    mutate(
      noncovid_RET_normalised = (noncovid_RET - min(noncovid_RET)) / (max(noncovid_RET) - min(noncovid_RET)),
      noncovid_volatility_normalised = 1 - ((noncovid_volatility - min(noncovid_volatility)) / (max(noncovid_volatility) - min(noncovid_volatility))),
      noncovid_sharpe_ratio_normalised = (noncovid_sharpe_ratio - min(noncovid_sharpe_ratio)) / (max(noncovid_sharpe_ratio) - min(noncovid_sharpe_ratio))
    )
  
  # Calculate scores based on different strategies for Non-COVID.
  # Weights are (return, Sharpe, inverted volatility) and sum to 1.
  strategies_noncovid <- list(
    max_return_noncovid = list(weights = c(0.5, 0.3, 0.2), 
                               columns = c("noncovid_RET_normalised", 
                                           "noncovid_sharpe_ratio_normalised", 
                                           "noncovid_volatility_normalised")),
    risk_adjusted_noncovid = list(weights = c(0.3, 0.5, 0.2), 
                                  columns = c("noncovid_RET_normalised", 
                                              "noncovid_sharpe_ratio_normalised", 
                                              "noncovid_volatility_normalised")),
    low_risk_noncovid = list(weights = c(0.2, 0.3, 0.5), 
                             columns = c("noncovid_RET_normalised", 
                                         "noncovid_sharpe_ratio_normalised", 
                                         "noncovid_volatility_normalised"))
  )
  
  results_noncovid <- list()
  
  for (strategy in names(strategies_noncovid)) {
    weight_vector <- strategies_noncovid[[strategy]]$weights
    score_columns <- strategies_noncovid[[strategy]]$columns
    
    # sweep(..., 2, weights, `*`) multiplies each metric column by its
    # strategy weight; rowSums then gives the weighted composite per stock.
    results_noncovid[[strategy]] <- normalised_data_noncovid %>%
      mutate(
        composite_score = rowSums(sweep(select(., all_of(score_columns)), 2, weight_vector, `*`))
      ) %>%
      arrange(desc(composite_score)) %>%
      select(TICKER, noncovid_RET_normalised, noncovid_volatility_normalised, noncovid_sharpe_ratio_normalised, composite_score)
    
    # Print the top 3 securities
    cat(paste("\nTop 3 for", strategy, ":\n"))
    print(head(results_noncovid[[strategy]], 3))
  }
  
  # Normalisation for COVID period.
  # COVID_RET needs its own !is.na() check because the column name does not
  # match the lowercase "covid_" prefix used by if_any/starts_with.
  normalised_data_covid <- data %>%
    filter(cluster == selected_cluster) %>%
    filter(!is.na(COVID_RET) & !if_any(starts_with("covid_"), is.na)) %>%
    mutate(
      covid_RET_normalised = (COVID_RET - min(COVID_RET)) / (max(COVID_RET) - min(COVID_RET)),
      covid_volatility_normalised = 1 - ((covid_volatility - min(covid_volatility)) / (max(covid_volatility) - min(covid_volatility))),
      covid_sharpe_ratio_normalised = (covid_sharpe_ratio - min(covid_sharpe_ratio)) / (max(covid_sharpe_ratio) - min(covid_sharpe_ratio))
    )
  
  # Calculate scores based on different strategies for COVID
  # (same weights and ordering as the non-COVID strategies above).
  strategies_covid <- list(
    max_return_covid = list(weights = c(0.5, 0.3, 0.2), 
                            columns = c("covid_RET_normalised", 
                                        "covid_sharpe_ratio_normalised", 
                                        "covid_volatility_normalised")),
    risk_adjusted_covid = list(weights = c(0.3, 0.5, 0.2), 
                               columns = c("covid_RET_normalised", 
                                           "covid_sharpe_ratio_normalised", 
                                           "covid_volatility_normalised")),
    low_risk_covid = list(weights = c(0.2, 0.3, 0.5), 
                          columns = c("covid_RET_normalised", 
                                      "covid_sharpe_ratio_normalised", 
                                      "covid_volatility_normalised"))
  )
  
  results_covid <- list()
  
  for (strategy in names(strategies_covid)) {
    weight_vector <- strategies_covid[[strategy]]$weights
    score_columns <- strategies_covid[[strategy]]$columns
    
    results_covid[[strategy]] <- normalised_data_covid %>%
      mutate(
        composite_score = rowSums(sweep(select(., all_of(score_columns)), 2, weight_vector, `*`))
      ) %>%
      arrange(desc(composite_score)) %>%
      select(TICKER, covid_RET_normalised, covid_volatility_normalised, covid_sharpe_ratio_normalised, composite_score)
    
    # Print the top 3 securities
    cat(paste("\nTop 3 for", strategy, ":\n"))
    print(head(results_covid[[strategy]], 3))
  }
}

# Call function: rank securities within cluster 3 of the entire-period
# clustering under the three weighting strategies
calculate_composite_scores(task4_entire, 3, "entire")
## 
## Top 3 for max_return_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 IDXG                     1                        0.897                  0.994
## 2 USEG                     0.825                    0.886                  0.994
## 3 ASNA                     0.726                    0.887                  0.995
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for risk_adjusted_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 IDXG                     1                        0.897                  0.994
## 2 USEG                     0.825                    0.886                  0.994
## 3 ASNA                     0.726                    0.887                  0.995
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for low_risk_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 IDXG                     1                        0.897                  0.994
## 2 USEG                     0.825                    0.886                  0.994
## 3 ASNA                     0.726                    0.887                  0.995
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for max_return_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 AHPI                  1                          0.0210                  0.999
## 2 PLAG                  0.206                      0.847                   0.994
## 3 FTR                   0.177                      0.881                   0.989
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for risk_adjusted_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 AHPI                  1                          0.0210                  0.999
## 2 PLAG                  0.206                      0.847                   0.994
## 3 FTR                   0.177                      0.881                   0.989
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for low_risk_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 CIIC                 0.104                        0.997                  0.969
## 2 HCCO                 0.0987                       0.981                  0.988
## 3 AMHC                 0.104                        0.985                  0.968
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
# Same ranking for cluster 3 of the non-COVID-period clustering
calculate_composite_scores(task4_noncovid, 3, "noncovid")
## 
## Top 3 for max_return_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 PECK                     1                        0.757                  0.990
## 2 NLS                      0.917                    0.798                  0.988
## 3 LMB                      0.621                    0.847                  0.980
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for risk_adjusted_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 PECK                     1                        0.757                  0.990
## 2 NLS                      0.917                    0.798                  0.988
## 3 LMB                      0.621                    0.847                  0.980
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for low_risk_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 NLS                      0.917                    0.798                  0.988
## 2 PECK                     1                        0.757                  0.990
## 3 FPH                      0.538                    0.932                  0.958
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for max_return_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 AIM                   1                           0.228                  0.997
## 2 UNFI                  0.702                       0.818                  0.987
## 3 ZM                    0.675                       0.872                  0.985
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for risk_adjusted_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 ZM                    0.675                       0.872                  0.985
## 2 UNFI                  0.702                       0.818                  0.987
## 3 EQT                   0.661                       0.826                  0.987
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for low_risk_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 ZM                    0.675                       0.872                  0.985
## 2 VIRT                  0.551                       0.919                  0.975
## 3 KR                    0.515                       0.926                  0.965
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
# COVID-period clustering uses cluster 2: k-means labels permute between
# runs, and per the characteristics tables covid cluster 2 matches the
# non-COVID cluster 3 profile (low bm, high roe)
calculate_composite_scores(task4_covid, 2, "covid")
## 
## Top 3 for max_return_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 NLS                      1                        0.798                  0.988
## 2 LMB                      0.677                    0.847                  0.980
## 3 AIM                      0.636                    0.826                  0.985
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for risk_adjusted_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 NLS                      1                        0.798                  0.988
## 2 LMB                      0.677                    0.847                  0.980
## 3 AIM                      0.636                    0.826                  0.985
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for low_risk_noncovid :
## # A tibble: 3 × 5
##   TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
##   <chr>                    <dbl>                    <dbl>                  <dbl>
## 1 NLS                      1                        0.798                  0.988
## 2 FPH                      0.586                    0.932                  0.958
## 3 VICR                     0.493                    0.949                  0.946
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## #   ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for max_return_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 AIM                   1                           0.228                  0.997
## 2 UNFI                  0.704                       0.818                  0.987
## 3 ZM                    0.677                       0.872                  0.985
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for risk_adjusted_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 ZM                    0.677                       0.872                  0.985
## 2 UNFI                  0.704                       0.818                  0.987
## 3 EQT                   0.663                       0.826                  0.987
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
## 
## Top 3 for low_risk_covid :
## # A tibble: 3 × 5
##   TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
##   <chr>                 <dbl>                       <dbl>                  <dbl>
## 1 ZM                    0.677                       0.872                  0.985
## 2 VIRT                  0.554                       0.919                  0.975
## 3 KR                    0.518                       0.926                  0.965
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>

5 Principal Component Analysis

Data Preparation

# Import data for Task 5 (PCA): monthly security file, S&P 500 constituent
# list, and the long-term Treasury rate series
Stock_data_3 <- read.csv("Stock_data_part3.csv")
sp500 <- read.csv("S&P500.csv")
long_term_rate <- read.csv("long-term-rates-2000-2023.csv")

# Make sure all the "date" are converted to ISO 8601 format
Stock_data_3$date <- as.Date(Stock_data_3$date, format = "%Y-%m-%d")
long_term_rate$date <- as.Date(long_term_rate$date, format = "%Y-%m-%d")

# Inspect data
str(Stock_data_3)
## 'data.frame':    1176728 obs. of  12 variables:
##  $ PERMNO: int  10001 10001 10001 10001 10001 10001 10001 10001 10001 10001 ...
##  $ date  : Date, format: "2010-01-29" "2010-02-26" ...
##  $ SHRCD : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ TICKER: chr  "EGAS" "EGAS" "EGAS" "EGAS" ...
##  $ PERMCO: int  7953 7953 7953 7953 7953 7953 7953 7953 7953 7953 ...
##  $ CUSIP : chr  "36720410" "36720410" "36720410" "36720410" ...
##  $ HSICIG: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ PRC   : num  10.1 10 10.2 11.4 11.4 ...
##  $ VOL   : int  3104 1510 2283 3350 3451 3537 2858 2595 1591 1803 ...
##  $ RET   : chr  "-0.018932" "-0.000656" "0.020643" "0.124385" ...
##  $ SHROUT: int  4361 4361 4361 6070 6071 6080 6080 6073 6073 6074 ...
##  $ SPREAD: num  NA NA NA NA NA NA NA NA NA NA ...
# Preview the first rows of the monthly security file
head(Stock_data_3, n = 5)
##   PERMNO       date SHRCD TICKER PERMCO    CUSIP HSICIG     PRC  VOL       RET
## 1  10001 2010-01-29    11   EGAS   7953 36720410     NA 10.0600 3104 -0.018932
## 2  10001 2010-02-26    11   EGAS   7953 36720410     NA 10.0084 1510 -0.000656
## 3  10001 2010-03-31    11   EGAS   7953 36720410     NA 10.1700 2283  0.020643
## 4  10001 2010-04-30    11   EGAS   7953 36720410     NA 11.3900 3350  0.124385
## 5  10001 2010-05-28    11   EGAS   7953 36720410     NA 11.4000 3451  0.004829
##   SHROUT SPREAD
## 1   4361     NA
## 2   4361     NA
## 3   4361     NA
## 4   6070     NA
## 5   6071     NA
# Preview the S&P 500 constituent table (Symbol / Name / Sector)
head(sp500, n = 5)
##   Symbol                          Name                 Sector
## 1    MMM                        3M Co.            Industrials
## 2    ACE                   ACE Limited             Financials
## 3    ABT           Abbott Laboratories            Health Care
## 4    ANF Abercrombie & Fitch Company A Consumer Discretionary
## 5    ACN                     Accenture Information Technology
# Data wrangling: align the two share-class tickers (BRK.B, BF.B) with the
# dot-free convention used in the CRSP TICKER column. A named lookup vector
# replaces the case_when chain; unmatched symbols fall through unchanged.
sp500 <- sp500 %>%
  mutate(Symbol = coalesce(
    unname(c("BRK.B" = "BRK", "BF.B" = "BF")[Symbol]),
    Symbol
  ))
# Data cleaning: recode sentinel/missing values to NA and extract the month
# for later dataset joins.
task5_dataset1 <- Stock_data_3 %>%
  mutate(
    # Negative PRC appears to be a bid/ask-average flag; 0 means missing.
    PRC = abs(PRC),
    PRC = ifelse(PRC == 0.0, NA, PRC),
    # -99 is the missing-volume code.
    VOL = ifelse(VOL == -99, NA, VOL),
    # RET arrives as character (see str() above); numeric codes and the
    # ".A"-".D" strings mark missing/invalid returns. Use NA_real_ so every
    # case_when() branch is a double — a bare NA is logical and only works
    # on newer dplyr versions that auto-coerce it.
    RET = case_when(
      RET %in% c(-44, -55, -66, -77, -88, -99) ~ NA_real_,
      RET %in% c(".A", ".B", ".C", ".D") ~ NA_real_,
      TRUE ~ as.numeric(RET)
    ),
    # Non-positive spreads are not meaningful
    SPREAD = ifelse(SPREAD <= 0, NA, SPREAD),
    month_year = as.yearmon(ymd(date))
  ) %>%
  
  # Remove duplicate security-month observations
  distinct(PERMNO, date, .keep_all = TRUE) %>%
  mutate(
    dollar_vol = PRC * VOL
  ) %>%
  # Retain only S&P500 stock
  filter(TICKER %in% sp500$Symbol)

# Create month variable for left join, restrict to the sample window, then
# convert to monthly data.
long_term_rate <- long_term_rate %>%
  mutate(
    month_year = as.yearmon(ymd(date))
  ) %>%
  # Compare yearmon against yearmon explicitly: comparing a yearmon column
  # with a bare string like "Jan 2010" can fall back to character ordering
  # (alphabetical, locale-dependent) and silently drop rows — TODO confirm
  # the original filter produced the intended window.
  filter(month_year >= as.yearmon("2010-01") &
           month_year <= as.yearmon("2022-12")) %>% 
  group_by(month_year) %>%
  summarise(
    long_term_rate = mean(long_term_rate, na.rm = TRUE)
  )

# Same treatment as the long-term rate: monthly average of the T-bill rate
short_term_rate <- risk_free_rate %>%
  rename(short_term_rate = risk_free_rate) %>%
  mutate(month_year = as.yearmon(ymd(date))) %>%
  filter(month_year >= "Jan 2010" & month_year <= "Dec 2022") %>%
  group_by(month_year) %>%
  summarise(short_term_rate = mean(short_term_rate, na.rm = TRUE))

# Average the daily Fama-French factors within each month
# (Mkt.RF is renamed to Mkt in the process)
FFF_monthly <- fama_french_factors %>%
  mutate(month_year = as.yearmon(ymd(date))) %>%
  filter(month_year >= "Jan 2010" & month_year <= "Dec 2022") %>%
  group_by(month_year) %>%
  summarise(
    Mkt = mean(Mkt.RF, na.rm = TRUE),
    SMB = mean(SMB, na.rm = TRUE),
    HML = mean(HML, na.rm = TRUE)
  )

# Financial ratios: drop divyield (mostly missing, better excluded) and
# key each row by month for the join against the stock panel
financial_ratio <- Stock_data_2 %>%
  filter(TICKER != "") %>%
  mutate(month_year = as.yearmon(ymd(public_date))) %>%
  select(-divyield, -public_date)

# Attach the monthly financial ratios to the stock panel
task5_dataset2 <- task5_dataset1 %>%
  left_join(financial_ratio, by = c("month_year", "TICKER"))

# Attach the monthly macro series: both interest rates and the
# Fama-French factors, all keyed on month_year
task5_dataset3 <- task5_dataset2 %>%
  left_join(long_term_rate, by = "month_year") %>%
  left_join(short_term_rate, by = "month_year") %>%
  left_join(FFF_monthly, by = "month_year")

# Inspect the first rows of the fully joined dataset
head(task5_dataset3, n = 5)
##   PERMNO       date SHRCD TICKER PERMCO    CUSIP HSICIG     PRC     VOL
## 1  10104 2010-01-29    11   ORCL   8045 68389X10     NA 23.0600 6068156
## 2  10104 2010-02-26    11   ORCL   8045 68389X10     NA 24.6500 5771538
## 3  10104 2010-03-31    11   ORCL   8045 68389X10     NA 25.7100 6618577
## 4  10104 2010-04-30    11   ORCL   8045 68389X10     NA 25.8675 5580407
## 5  10104 2010-05-28    11   ORCL   8045 68389X10     NA 22.5700 7406752
##         RET  SHROUT SPREAD month_year dollar_vol  CAPEI    bm    evm pe_exi
## 1 -0.057888 5011220     NA   Jan 2010  139931677 26.386 0.252 10.110 20.052
## 2  0.068951 5015000     NA   Feb 2010  142268412 28.226 0.252 10.110 21.435
## 3  0.043002 5019091     NA   Mar 2010  170163615 29.464 0.252 10.110 22.357
## 4  0.008071 5029523     NA   Apr 2010  144351178 28.783 0.233 10.983 23.096
## 5 -0.127477 5026000     NA   May 2010  167170393 25.096 0.233 10.983 20.152
##     dpr   npm   roa   roe  roce   ptb long_term_rate short_term_rate
## 1 0.129 0.250 0.232 0.232 0.250 4.145       4.318421       0.3231579
## 2 0.129 0.250 0.232 0.232 0.250 4.434       4.305263       0.3263158
## 3 0.129 0.250 0.232 0.232 0.250 4.628       4.332174       0.3726087
## 4 0.177 0.234 0.227 0.218 0.242 4.515       4.389545       0.4190909
## 5 0.177 0.234 0.227 0.218 0.242 3.937       3.990000       0.3520000
##           Mkt        SMB         HML
## 1 -0.17578947 0.02263158  0.02736842
## 2  0.18315789 0.05947368  0.16842105
## 3  0.26652174 0.06086957  0.09260870
## 4  0.09904762 0.22285714  0.13761905
## 5 -0.39000000 0.01450000 -0.11500000
# Transform to a time-series dataset: collapse the stock panel to one row
# per date by averaging across stocks (after trying different aggregation
# methods, taking the average worked best). across() replaces the original
# 18 duplicated mean() lines; column names and order are unchanged.
task5_dataset4 <- task5_dataset3 %>%
  group_by(date) %>%
  summarise(
    across(
      c(dollar_vol, RET, SHROUT, long_term_rate, short_term_rate,
        Mkt, SMB, HML, CAPEI, bm, evm, pe_exi, dpr, npm,
        roa, roe, roce, ptb),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  mutate(
    # Term spread between the long- and short-term rates
    interest_rate_spread = long_term_rate - short_term_rate,
    # 30-day rolling standard deviation of the average return...
    std_ret = rollapply(RET, width = 30, FUN = sd, fill = NA,
                        align = "right", na.rm = TRUE),
    # ...annualised assuming 252 trading days
    volatility = std_ret * sqrt(252)
  ) %>%
  # Drop the warm-up rows where the rolling sd is undefined
  filter(!is.na(volatility))

# Coerce to a plain data frame (required before running the PCA) and drop
# the columns excluded from it: date is an identifier, the rates and Mkt
# are kept aside as regressors, std_ret is superseded by volatility
task5_dataset5 <- task5_dataset4 %>%
  select(-date, -long_term_rate, -short_term_rate, -Mkt, -std_ret) %>%
  as.data.frame()

# Inspect the distribution of every variable before the PCA
summary(task5_dataset4)
##       date              dollar_vol             RET                SHROUT      
##  Min.   :2012-06-29   Min.   : 36847685   Min.   :-0.202143   Min.   :585492  
##  1st Qu.:2015-02-13   1st Qu.: 46833884   1st Qu.:-0.006104   1st Qu.:607794  
##  Median :2017-09-29   Median : 56757620   Median : 0.013068   Median :632938  
##  Mean   :2017-09-29   Mean   : 64129273   Mean   : 0.012131   Mean   :635908  
##  3rd Qu.:2020-05-14   3rd Qu.: 82015779   3rd Qu.: 0.039496   3rd Qu.:649822  
##  Max.   :2022-12-30   Max.   :137978627   Max.   : 0.172828   Max.   :720953  
##  long_term_rate  short_term_rate       Mkt                SMB           
##  Min.   :1.151   Min.   :0.0495   Min.   :-0.50895   Min.   :-0.304500  
##  1st Qu.:2.207   1st Qu.:0.1274   1st Qu.:-0.05159   1st Qu.:-0.084814  
##  Median :2.661   Median :0.4521   Median : 0.07000   Median : 0.011579  
##  Mean   :2.595   Mean   :0.9261   Mean   : 0.05092   Mean   :-0.001827  
##  3rd Qu.:3.009   3rd Qu.:1.5475   3rd Qu.: 0.15394   3rd Qu.: 0.069500  
##  Max.   :4.235   Max.   :4.5120   Max.   : 0.64048   Max.   : 0.375263  
##       HML                CAPEI               bm              evm         
##  Min.   :-0.693182   Min.   :-352.06   Min.   :0.4197   Min.   : 0.5219  
##  1st Qu.:-0.085526   1st Qu.:  12.34   1st Qu.:0.4815   1st Qu.:11.1566  
##  Median :-0.014000   Median :  19.21   Median :0.5039   Median :12.8450  
##  Mean   : 0.002641   Mean   :  15.95   Mean   :0.5342   Mean   :12.5192  
##  3rd Qu.: 0.076450   3rd Qu.:  26.98   3rd Qu.:0.5654   3rd Qu.:13.7939  
##  Max.   : 0.662000   Max.   : 315.59   Max.   :1.0024   Max.   :21.6639  
##      pe_exi           dpr              npm                 roa        
##  Min.   : 6.93   Min.   :0.3953   Min.   :-48.60342   Min.   :0.1045  
##  1st Qu.:16.98   1st Qu.:0.5220   1st Qu.:  0.03955   1st Qu.:0.1257  
##  Median :19.33   Median :0.6409   Median :  0.09532   Median :0.1334  
##  Mean   :19.25   Mean   :0.7103   Mean   : -3.62829   Mean   :0.1328  
##  3rd Qu.:21.13   3rd Qu.:0.9030   3rd Qu.:  0.10499   3rd Qu.:0.1436  
##  Max.   :28.84   Max.   :1.3859   Max.   :  0.11613   Max.   :0.1556  
##       roe               roce             ptb        interest_rate_spread
##  Min.   :0.02441   Min.   :0.1234   Min.   :3.108   Min.   :-0.6405     
##  1st Qu.:0.16965   1st Qu.:0.1598   1st Qu.:4.065   1st Qu.: 1.0142     
##  Median :0.18671   Median :0.1733   Median :4.875   Median : 1.7391     
##  Mean   :0.20100   Mean   :0.1689   Mean   :4.933   Mean   : 1.6691     
##  3rd Qu.:0.25664   3rd Qu.:0.1817   3rd Qu.:5.394   3rd Qu.: 2.2865     
##  Max.   :0.34199   Max.   :0.1979   Max.   :8.156   Max.   : 3.4800     
##     std_ret          volatility    
##  Min.   :0.02154   Min.   :0.3420  
##  1st Qu.:0.03291   1st Qu.:0.5225  
##  Median :0.04180   Median :0.6636  
##  Mean   :0.04542   Mean   :0.7211  
##  3rd Qu.:0.05836   3rd Qu.:0.9264  
##  Max.   :0.07752   Max.   :1.2306
# Confirm the PCA input is a plain data frame of numeric columns
str(task5_dataset5)
## 'data.frame':    127 obs. of  17 variables:
##  $ dollar_vol          : num  43443315 40588141 38140313 39590774 41834684 ...
##  $ RET                 : num  0.03946 0.00286 0.03124 0.02398 -0.00654 ...
##  $ SHROUT              : num  587939 586444 589763 589269 585492 ...
##  $ SMB                 : num  0.0343 -0.1305 0.0226 0.0258 -0.0548 ...
##  $ HML                 : num  0.029048 0.000952 0.053913 0.084737 0.173333 ...
##  $ CAPEI               : num  18 17.8 19 20 18.5 ...
##  $ bm                  : num  0.583 0.583 0.638 0.642 0.64 ...
##  $ evm                 : num  10.6 10.6 10.2 11.1 11.1 ...
##  $ pe_exi              : num  17.8 17.4 17.9 18.4 18 ...
##  $ dpr                 : num  0.422 0.422 0.525 0.526 0.526 ...
##  $ npm                 : num  0.106 0.106 0.102 0.102 0.102 ...
##  $ roa                 : num  0.156 0.156 0.153 0.153 0.153 ...
##  $ roe                 : num  0.167 0.168 0.177 0.177 0.178 ...
##  $ roce                : num  0.198 0.198 0.196 0.196 0.195 ...
##  $ ptb                 : num  3.14 3.11 3.11 3.22 3.14 ...
##  $ interest_rate_spread: num  2.1 2 2.18 2.28 2.29 ...
##  $ volatility          : num  0.868 0.859 0.856 0.839 0.838 ...

Run PCA

# Perform PCA on the standardised variables.
# FIX: the original call spelled the argument "centre"; prcomp() only
# knows "center", so the misspelled argument was silently absorbed by
# `...` and ignored. Results are unchanged only because centring is
# prcomp()'s default -- spell it correctly so the intent is explicit.
task5_pca <- prcomp(task5_dataset5, center = TRUE, scale. = TRUE)
summary(task5_pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.3080 1.5510 1.23707 1.18621 1.12444 1.04010 0.94410
## Proportion of Variance 0.3134 0.1415 0.09002 0.08277 0.07437 0.06364 0.05243
## Cumulative Proportion  0.3134 0.4549 0.54488 0.62765 0.70203 0.76566 0.81809
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.86015 0.75763 0.69938 0.59553 0.58830 0.50287 0.39443
## Proportion of Variance 0.04352 0.03376 0.02877 0.02086 0.02036 0.01488 0.00915
## Cumulative Proportion  0.86162 0.89538 0.92415 0.94502 0.96537 0.98025 0.98940
##                           PC15    PC16    PC17
## Standard deviation     0.29707 0.24997 0.17160
## Proportion of Variance 0.00519 0.00368 0.00173
## Cumulative Proportion  0.99459 0.99827 1.00000
# Show the loadings (eigenvectors) of every principal component
print(task5_pca$rotation)
##                               PC1         PC2         PC3           PC4
## dollar_vol            0.367885748  0.10777197  0.15766493 -0.0844284455
## RET                   0.049352005 -0.30559848 -0.30389508 -0.3191010818
## SHROUT                0.388944920  0.14001799  0.10668939  0.0007736679
## SMB                   0.026954120 -0.33611941 -0.06459950 -0.1765879614
## HML                   0.063068301 -0.04692741 -0.42853262 -0.2851617295
## CAPEI                -0.039649103  0.24207246 -0.40135237  0.3030777662
## bm                    0.003563024 -0.43146624  0.40517214 -0.0951636288
## evm                   0.036316321  0.35990854 -0.13086026 -0.0769245694
## pe_exi               -0.016350234  0.32239520 -0.14539727 -0.2264548116
## dpr                   0.262791642  0.03833889  0.01307735  0.3006252023
## npm                  -0.280648118  0.03850808  0.21063836  0.2346180930
## roa                  -0.353043386  0.20307363  0.10852398 -0.2631312891
## roe                   0.128955164  0.39303739  0.21901341 -0.3798395423
## roce                 -0.309098020  0.21411480  0.22592900 -0.2925551934
## ptb                   0.404588648  0.04607298 -0.15224663 -0.0069807801
## interest_rate_spread -0.230684572 -0.16019841 -0.34688185 -0.2177245840
## volatility            0.318638364 -0.09836080  0.14576294 -0.3584064968
##                              PC5         PC6         PC7         PC8
## dollar_vol           -0.09722908 -0.08259944  0.12019048 -0.03430862
## RET                  -0.12271489  0.41947292 -0.02501959  0.08124524
## SHROUT               -0.17252443  0.05863392 -0.01007236 -0.12109082
## SMB                  -0.24090224  0.32545129 -0.54025777 -0.33695114
## HML                  -0.17319593  0.13166689  0.68463051 -0.02964720
## CAPEI                -0.44721616 -0.10154497 -0.09716755 -0.18655377
## bm                    0.22003598  0.06225616  0.20856748 -0.15824081
## evm                   0.41002026  0.08500053  0.09282768 -0.69879284
## pe_exi                0.46036278  0.40111848 -0.18365867  0.24274917
## dpr                   0.11277157  0.31787832  0.12452483  0.33018061
## npm                  -0.24104211  0.38481824  0.23383726  0.01295801
## roa                  -0.15798552  0.01410542  0.03881481  0.12438976
## roe                  -0.25337674  0.02070937 -0.15452070  0.22916804
## roce                 -0.13992036  0.02119900  0.04332998 -0.12578038
## ptb                   0.03812125 -0.03840319 -0.12108559  0.12566540
## interest_rate_spread  0.19095208 -0.43527376 -0.09312257  0.20650456
## volatility           -0.10840653 -0.25898228  0.09032970 -0.06803291
##                               PC9         PC10        PC11        PC12
## dollar_vol            0.122574816 -0.126471175  0.10488372 -0.49327446
## RET                  -0.519738841 -0.427967258  0.20908091 -0.07839313
## SHROUT                0.100861012 -0.125656590  0.02720539  0.05305744
## SMB                   0.331819854  0.393232105  0.04005209 -0.03203888
## HML                   0.331017715  0.194500391 -0.12112016  0.21416706
## CAPEI                -0.338750828  0.094398615 -0.50004402 -0.21135589
## bm                   -0.252836458  0.089392493 -0.52062344  0.00667326
## evm                  -0.156021532  0.038062416  0.26337587 -0.05400858
## pe_exi                0.179416238 -0.034213377 -0.46693841 -0.25293551
## dpr                  -0.307155516  0.641865591  0.24427754 -0.07667120
## npm                   0.223769984 -0.135145687  0.10899305 -0.47460250
## roa                  -0.130066686  0.133797439  0.11589223 -0.01392507
## roe                  -0.006950409 -0.001973818 -0.03687582  0.21435281
## roce                 -0.262189975  0.253108030  0.02819838  0.05611067
## ptb                  -0.071769041  0.002810908 -0.01516842  0.13410173
## interest_rate_spread  0.044538823  0.178403574  0.16466222 -0.36738743
## volatility           -0.111912060  0.170338854 -0.07278982 -0.39477123
##                              PC13        PC14         PC15         PC16
## dollar_vol            0.119623359 -0.56373111 -0.284317522  0.276109502
## RET                  -0.006987718  0.03278773 -0.028093495  0.077465259
## SHROUT                0.101353235  0.49698979 -0.282670435 -0.276157658
## SMB                  -0.024502409 -0.11430105 -0.006171665  0.005747384
## HML                  -0.013472365 -0.04056733 -0.036231837  0.028210973
## CAPEI                -0.086074902 -0.04241015 -0.011378007  0.029622328
## bm                   -0.290979570 -0.13337706 -0.167770018 -0.139050456
## evm                  -0.253228040 -0.02768383  0.084217002 -0.060632841
## pe_exi                0.185951256  0.08039388  0.016728465  0.020839337
## dpr                  -0.062933612  0.04470534 -0.002786329  0.113778500
## npm                  -0.262904282  0.15056060 -0.034283540 -0.315634128
## roa                   0.141094377 -0.36517626  0.355580478 -0.401075055
## roe                  -0.637695442  0.01458670 -0.009063363  0.181892751
## roce                  0.404209377  0.13536522 -0.528781095  0.038546932
## ptb                   0.020149905 -0.27876606 -0.167673497 -0.699345685
## interest_rate_spread -0.313049876  0.15079351 -0.344419160 -0.130997417
## volatility            0.149575729  0.33999825  0.497414472 -0.045830166
##                              PC17
## dollar_vol            0.121015045
## RET                   0.026348723
## SHROUT                0.571674588
## SMB                   0.021704151
## HML                  -0.007438289
## CAPEI                 0.061143226
## bm                    0.163769509
## evm                  -0.010006597
## pe_exi                0.027577317
## dpr                   0.099176494
## npm                  -0.247494729
## roa                   0.471130702
## roe                  -0.090716103
## roce                 -0.291033380
## ptb                  -0.398608354
## interest_rate_spread  0.154031470
## volatility           -0.239385411
# Extract eigenvalues and the variance explained by each component
eigenvalues <- task5_pca$sdev^2
variance_explained <- eigenvalues / sum(eigenvalues)
cumulative_variance <- cumsum(variance_explained)
# seq_along() instead of 1:length(): safe even for zero-length input,
# where 1:length(x) would wrongly produce c(1, 0)
components <- seq_along(eigenvalues)
scree_data <- data.frame(Components = components, Eigenvalues = eigenvalues)

# Scree plot: one point per component, with the Kaiser criterion
# (eigenvalue = 1) drawn as a dashed reference line
task5_screeplot <- ggplot(scree_data, aes(x = Components, y = Eigenvalues)) +
  geom_line(color = "blue") +
  geom_point(color = "blue", size = 2) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Scree Plot of Principal Components",
    x = "Principal Components",
    y = "Proportion of Variance Explained"
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    title = element_text(size = 14)
  ) +
  scale_x_continuous(breaks = components) +
  annotate("text", x = max(components), y = 1,
           label = "Kaiser Criterion", color = "red",
           hjust = 1.1, vjust = -1) +
  annotate("text", x = 1, y = max(eigenvalues),
           label = "Eigenvalues", color = "blue", hjust = -0.2)

print(task5_screeplot)

# Save the figure for the report. NOTE(review): save_plot() is a helper
# defined elsewhere in this document -- confirm its output folder/format.
save_plot(task5_screeplot, "task5 - scree plot")

# Create the eigenvalue summary table (one row per component).
# seq_along() instead of 1:length() -- safe for zero-length input.
pca_result <- data.frame(
  Principal_Component = seq_along(eigenvalues),
  Eigenvalue = eigenvalues,
  Variance_Explained = variance_explained,
  Cumulative_Variance = cumulative_variance
)

print(pca_result)
##    Principal_Component Eigenvalue Variance_Explained Cumulative_Variance
## 1                    1 5.32701796        0.313353998           0.3133540
## 2                    2 2.40566351        0.141509618           0.4548636
## 3                    3 1.53034174        0.090020103           0.5448837
## 4                    4 1.40710145        0.082770674           0.6276544
## 5                    5 1.26436039        0.074374141           0.7020285
## 6                    6 1.08180109        0.063635359           0.7656639
## 7                    7 0.89132849        0.052431087           0.8180950
## 8                    8 0.73986263        0.043521331           0.8616163
## 9                    9 0.57400350        0.033764912           0.8953812
## 10                  10 0.48913178        0.028772458           0.9241537
## 11                  11 0.35466017        0.020862363           0.9450160
## 12                  12 0.34609445        0.020358497           0.9653745
## 13                  13 0.25287876        0.014875221           0.9802498
## 14                  14 0.15557532        0.009151489           0.9894013
## 15                  15 0.08824877        0.005191104           0.9945924
## 16                  16 0.06248348        0.003675499           0.9982679
## 17                  17 0.02944649        0.001732147           1.0000000

Run a regression with the PCA result

# Keep only the components retained under the Kaiser criterion (PC1-PC6)
pca_regression_df <- as.data.frame(task5_pca$x[, 1:6])

# Use the original average S&P 500 return as the dependent variable
pca_regression_df$RET <- task5_dataset5$RET

# Add the market factor as an additional regressor
# (task5_dataset4 has the same rows as the PCA input)
pca_regression_df <- pca_regression_df %>%
  bind_cols(Mkt = task5_dataset4$Mkt)

# Regress average returns on the retained components plus the market factor
pca_regression_model <- lm(RET ~ ., data = pca_regression_df)

summary(pca_regression_model)
## 
## Call:
## lm(formula = RET ~ ., data = pca_regression_df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0266487 -0.0063438  0.0005195  0.0058295  0.0269589 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0037602  0.0009034   4.162 5.99e-05 ***
## PC1          0.0019018  0.0003706   5.132 1.13e-06 ***
## PC2         -0.0052062  0.0006543  -7.957 1.16e-12 ***
## PC3         -0.0079547  0.0007346 -10.828  < 2e-16 ***
## PC4         -0.0071145  0.0007833  -9.083 2.77e-15 ***
## PC5         -0.0051697  0.0007604  -6.799 4.47e-10 ***
## PC6          0.0082613  0.0009339   8.846 1.00e-14 ***
## Mkt          0.1643814  0.0059577  27.591  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.009589 on 119 degrees of freedom
## Multiple R-squared:  0.9637, Adjusted R-squared:  0.9616 
## F-statistic: 451.6 on 7 and 119 DF,  p-value: < 2.2e-16

Create a plot to show the loadings of each variable

# Prepare the data: loadings of the six retained components, reshaped to
# long format (one row per variable-component pair) for plotting
pca_loading_factor <- as.data.frame(task5_pca$rotation) %>%
  select(PC1, PC2, PC3, PC4, PC5, PC6)
pca_loading_factor$Variable <- rownames(pca_loading_factor)
pca_loadings_df <- pca_loading_factor %>%
  filter(Variable != "RET") %>%
  pivot_longer(cols = -Variable, names_to = "PC", values_to = "Loading")

# Generate the plot - can observe in each PC, the variable is positive / negative, and the different factor representation in each PC
ggplot(pca_loadings_df, 
       aes(x = Variable, 
           y = Loading, 
           fill = PC)) +
  
  geom_bar(stat = "identity", 
           position = "dodge") +
  
  labs(title = "Loadings of each variable for significant PCs",
       x = "Variables",
       y = "Loadings")+ 
  
  facet_wrap(~ PC) +
  
  coord_flip() + 
  
  theme(axis.text.y = element_text(angle = 0)) 

6 Difference-in-Difference Analysis

Data Preparation

# Load library
library(lme4)
library(plm)  

# Import data: Tick Size Pilot group assignments and three daily stock files
tick_pilot <- read.csv("Tick_Pilot_Test_Group.csv")
stock_1415 <- read.csv("DailyStocks_2014_2015.csv")
stock_1617 <- read.csv("DailyStocks_2016_2017.csv")
stock_1820 <- read.csv("DailyStocks_2018_2020.csv")

# Parse every date column to Date (ISO 8601) so filters and joins compare
# dates rather than strings
tick_pilot$Effective_Date <- as.Date(tick_pilot$Effective_Date, format = "%Y-%m-%d")
stock_1415$date <- as.Date(stock_1415$date, format = "%Y-%m-%d")
stock_1617$date <- as.Date(stock_1617$date, format = "%Y-%m-%d")
stock_1820$date <- as.Date(stock_1820$date, format = "%Y-%m-%d")

Inspect data

## The column name is the same:  TRUE
## The column name is the same: TRUE
## The column name is the same: TRUE
## Differences in structure between stock_1415 and stock_1820:  NUMTRD
## Differences in structure between stock_1617 and stock_1820:  NUMTRD
## Since we will not use NUMTRD in this analysis, no need to action.

Short-term effect

Create dataset

The program's effective date is 2016-10-01. To account for potential initial volatility, the market is allowed a month to adjust, so the start of the post-period is chosen as November 1, 2016, and the short-term effect is observed within six months. For the pre-period, using the same dates in the previous year better controls for seasonal effects and other factors that may influence bid-ask spreads.

# Combine the daily files and keep the two analysis windows:
# pre-period (Nov 2015 - Sep 2016) and post-period (Nov 2016 - Jan 2017),
# restricted to securities enrolled in the pilot programme.
# Parentheses make the &/| precedence explicit (identical behaviour).
short_period <- stock_1415 %>%
  bind_rows(stock_1617) %>%
  filter((date >= "2015-11-01" & date < "2016-10-01") |
           (date >= "2016-11-01" & date <= "2017-01-31")) %>%
  select(PERMNO, date, TICKER, BID, ASK) %>%
  filter(TICKER %in% tick_pilot$Ticker_Symbol)

# Inspect the combined quote dataset
head(short_period, n = 5)
##   PERMNO       date TICKER   BID   ASK
## 1  10025 2015-11-02   AEPI 83.76 84.00
## 2  10025 2015-11-03   AEPI 80.13 80.41
## 3  10025 2015-11-04   AEPI 79.62 79.87
## 4  10025 2015-11-05   AEPI 79.35 79.51
## 5  10025 2015-11-06   AEPI 86.00 86.34

Data wrangling

# Clean data
short_period_df1 <- short_period %>%
  mutate(
    # FIX: compute the raw quoted spread ONCE from the unmodified quotes.
    # The original recomputed ASK - BID inside the second ifelse(), after
    # ASK had already been overwritten; it only gave the right answer
    # because ifelse(NA, ...) happens to return NA.
    raw_spread = ASK - BID,
    # Crossed quotes (ASK < BID) are data errors: blank both sides
    ASK = ifelse(raw_spread < 0, NA, as.numeric(as.character(ASK))),
    BID = ifelse(raw_spread < 0, NA, as.numeric(as.character(BID)))
  ) %>%
  # Keep one quote per security per day
  distinct(PERMNO, date, .keep_all = TRUE) %>%
  group_by(PERMNO) %>%
  # Relative bid-ask spread: quoted spread over the quote midpoint
  mutate(
    bid_ask_spread = (ASK - BID) / ((ASK + BID) / 2)
    ) %>%
  ungroup() %>%
  select(date, TICKER, bid_ask_spread) %>%
  # Attach each security's pilot-programme group
  left_join(tick_pilot %>%
              select(Ticker_Symbol, Tick_Size_Pilot_Program_Group), by = c("TICKER" = "Ticker_Symbol")) %>%
  rename(Test_group_ = Tick_Size_Pilot_Program_Group) %>%
  filter(!is.na(bid_ask_spread)) %>%
  # Post-period indicator: 1 on/after the programme effective date
  mutate(
    d_after = ifelse(date < "2016-10-01", 0, 1),
    Test_group_ = factor(Test_group_)
  )

# Inspect the number of distinct stocks per test group (short period);
# C is the control group, G1-G3 are the three treatment groups
print(short_period_df1 %>%
  group_by(Test_group_) %>%
  summarise(Ticker_per_group = n_distinct(TICKER)))
## # A tibble: 4 × 2
##   Test_group_ Ticker_per_group
##   <fct>                  <int>
## 1 C                        959
## 2 G1                       328
## 3 G2                       326
## 4 G3                       314
head(short_period_df1, n = 5)
## # A tibble: 5 × 5
##   date       TICKER bid_ask_spread Test_group_ d_after
##   <date>     <chr>           <dbl> <fct>         <dbl>
## 1 2015-11-02 AEPI          0.00286 G2                0
## 2 2015-11-03 AEPI          0.00349 G2                0
## 3 2015-11-04 AEPI          0.00313 G2                0
## 4 2015-11-05 AEPI          0.00201 G2                0
## 5 2015-11-06 AEPI          0.00395 G2                0

Descriptive Statistics

# Descriptive statistics: mean and standard deviation of the relative
# bid-ask spread within each pilot group
print(short_period_df1 %>% 
  group_by(Test_group_) %>% 
  summarise(
    mean_spread = mean(bid_ask_spread, na.rm = TRUE), 
    sd_spread = sd(bid_ask_spread, na.rm = TRUE)
  ))
## # A tibble: 4 × 3
##   Test_group_ mean_spread sd_spread
##   <fct>             <dbl>     <dbl>
## 1 C               0.00692    0.0222
## 2 G1              0.00680    0.0200
## 3 G2              0.00754    0.0205
## 4 G3              0.00706    0.0240

DiD regressions

# Run regression with fixed effect
# NOTE(review): plm()'s index argument is c(individual, time). Here it is
# c("date", "TICKER"), so the "individual" effect absorbed by the within
# model is a DATE effect -- the output below reports n = 293 panels, which
# is the number of dates, not stocks. If stock fixed effects were intended,
# the index should be c("TICKER", "date"); confirm before interpreting.
DiD_model_1 <- plm(bid_ask_spread ~ Test_group_ * d_after,
                   data = short_period_df1,
                   index = c("date", "TICKER"),
                   model = "within")
summary(DiD_model_1)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = bid_ask_spread ~ Test_group_ * d_after, data = short_period_df1, 
##     model = "within", index = c("date", "TICKER"))
## 
## Unbalanced Panel: n = 293, T = 1886-1926, N = 559863
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
## -0.0117571 -0.0062784 -0.0051638 -0.0023067  1.9891584 
## 
## Coefficients:
##                          Estimate  Std. Error t-value  Pr(>|t|)    
## Test_group_G1         -6.4777e-04  9.2381e-05 -7.0120 2.352e-12 ***
## Test_group_G2         -7.1589e-05  9.2209e-05 -0.7764    0.4375    
## Test_group_G3         -4.8118e-04  9.3995e-05 -5.1192 3.069e-07 ***
## Test_group_G1:d_after  2.5343e-03  2.0112e-04 12.6013 < 2.2e-16 ***
## Test_group_G2:d_after  3.3135e-03  2.0120e-04 16.4692 < 2.2e-16 ***
## Test_group_G3:d_after  2.9495e-03  2.0415e-04 14.4478 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    267.31
## Residual Sum of Squares: 267.07
## R-Squared:      0.00089239
## Adj. R-Squared: 0.00036031
## F-statistic: 83.299 on 6 and 559564 DF, p-value: < 2.22e-16
# Random-effects counterpart, used to gauge how much the fixed effect matters
# NOTE(review): as with the within model, index = c("date", "TICKER")
# treats dates as the panel individuals; confirm this is intentional
# before comparing the two specifications.
DiD_model_1_random <- plm(bid_ask_spread ~ Test_group_ * d_after,
                   data = short_period_df1,
                   index = c("date", "TICKER"),
                   model = "random")

summary(DiD_model_1_random)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = bid_ask_spread ~ Test_group_ * d_after, data = short_period_df1, 
##     model = "random", index = c("date", "TICKER"))
## 
## Unbalanced Panel: n = 293, T = 1886-1926, N = 559863
## 
## Effects:
##                     var   std.dev share
## idiosyncratic 4.773e-04 2.185e-02     1
## individual    5.944e-08 2.438e-04     0
## theta:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1001  0.1009  0.1012  0.1012  0.1017  0.1019 
## 
## Residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.00957 -0.00622 -0.00551  0.00000 -0.00242  1.99047 
## 
## Coefficients:
##                          Estimate  Std. Error  z-value  Pr(>|z|)    
## (Intercept)            7.0968e-03  4.9307e-05 143.9319 < 2.2e-16 ***
## Test_group_G1         -6.4980e-04  9.2423e-05  -7.0306 2.056e-12 ***
## Test_group_G2         -7.0419e-05  9.2251e-05  -0.7633    0.4453    
## Test_group_G3         -4.8404e-04  9.4038e-05  -5.1473 2.643e-07 ***
## d_after               -8.4170e-04  1.0742e-04  -7.8359 4.657e-15 ***
## Test_group_G1:d_after  2.5375e-03  2.0121e-04  12.6111 < 2.2e-16 ***
## Test_group_G2:d_after  3.3125e-03  2.0129e-04  16.4565 < 2.2e-16 ***
## Test_group_G3:d_after  2.9517e-03  2.0425e-04  14.4517 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    267.72
## Residual Sum of Squares: 267.45
## R-Squared:      0.0010111
## Adj. R-Squared: 0.00099864
## Chisq: 563.244 on 7 DF, p-value: < 2.22e-16

The results are quite similar but not identical, suggesting that the fixed effects have some influence; it is therefore better to use the model with fixed effects.

Long-term effect

The program was in effect for two years. Observe the long-term effect:

# To compute volatility per period, build the two windows separately.

# Pre-treatment window: the baseline year before the programme took effect.
# NOTE(review): this window starts 2015-10-01 while the short-term
# analysis's pre-period starts 2015-11-01 -- confirm which baseline is
# intended, since the comment above says they should be consistent.
pre_treatment <- stock_1415 %>%
  bind_rows(stock_1617) %>%
  filter(date >= "2015-10-01" & date <= "2016-09-30") %>%
  select(PERMNO, date, TICKER, PRC, VOL, RET, BID, ASK, SHROUT) %>%
  filter(TICKER %in% tick_pilot$Ticker_Symbol)

# Post-treatment window: two years from the effective date. The first
# month only seeds the rolling standard deviation, so the usable
# post-period effectively starts on November 1, 2016.
post_treatment <- stock_1617 %>%
  bind_rows(stock_1820) %>%
  filter(date >= "2016-10-01" & date <= "2018-09-30") %>%
  select(PERMNO, date, TICKER, PRC, VOL, RET, BID, ASK, SHROUT) %>%
  filter(TICKER %in% tick_pilot$Ticker_Symbol)

Data wrangling

# Define trading-day counts used to annualise volatility (a full trading
# year is typically ~252 days); halving the distinct-date count assumes
# each window spans two calendar years.
# NOTE(review): pre_treatment covers only ONE year (2015-10 to 2016-09),
# so trading_days_pre comes out near 126, not 252 -- confirm whether the
# pre-period volatility is meant to be annualised with sqrt(126).
trading_days_pre <- as.integer(length(unique(pre_treatment$date)) / 2)
trading_days_post <- as.integer(length(unique(post_treatment$date)) / 2)

# Clean data (pre-period)
pre_treatment_df1 <- pre_treatment %>%
  mutate(
    PERMNO = as.character(PERMNO),
    PRC = abs(PRC),
    # Handle missing-value sentinels
    PRC = ifelse(PRC == 0.0, NA, PRC),
    VOL = ifelse(VOL == -99, NA, VOL),
    RET = case_when(
      RET %in% c(-44, -55, -66, -77, -88, -99) ~ NA,  
      RET %in% c(".A", ".B", ".C", ".D") ~ NA,        
      TRUE ~ as.numeric(as.character(RET))
    ),
    # FIX: compute the raw spread once from the unmodified quotes; the
    # original recomputed ASK - BID after ASK had been overwritten and
    # only worked because ifelse(NA, ...) returns NA.
    raw_spread = ASK - BID,
    ASK = ifelse(raw_spread < 0, NA, as.numeric(as.character(ASK))),
    BID = ifelse(raw_spread < 0, NA, as.numeric(as.character(BID)))
  ) %>%
  # Remove duplicate security-day rows
  distinct(PERMNO, date, .keep_all = TRUE) %>% 
  arrange(PERMNO, date) %>% 
  group_by(PERMNO) %>% 
  # Compute relevant variables per security
  mutate(
    # 30-day rolling mean and sd of returns
    mean_ret = rollmean(RET, k = 30, fill = NA, align = "right", na.rm = TRUE),
    std_ret = rollapply(RET, width = 30, FUN = sd, fill = NA,
                        align = "right", na.rm = TRUE),
    # Annualised with the pre-period trading-day count
    volatility = std_ret * sqrt(trading_days_pre),
    dollar_vol = PRC * VOL,
    # SHROUT is in thousands of shares
    market_cap = SHROUT * PRC * 1000,
    bid_ask_spread = (ASK - BID) / ((ASK + BID) / 2)
    ) %>% 
  ungroup() %>% 
  select(PERMNO, date, TICKER, bid_ask_spread, volatility, dollar_vol, market_cap)

# Clean data (post-period) -- mirrors the pre-period cleaning
post_treatment_df1 <- post_treatment %>%
  mutate(
    PERMNO = as.character(PERMNO),
    PRC = abs(PRC),
    # Handle missing-value sentinels
    PRC = ifelse(PRC == 0.0, NA, PRC),
    VOL = ifelse(VOL == -99, NA, VOL),
    RET = case_when(
      RET %in% c(-44, -55, -66, -77, -88, -99) ~ NA,  
      RET %in% c(".A", ".B", ".C", ".D") ~ NA,        
      TRUE ~ as.numeric(as.character(RET))
    ),
    # FIX: compute the raw spread once from the unmodified quotes; the
    # original recomputed ASK - BID after ASK had been overwritten and
    # only worked because ifelse(NA, ...) returns NA.
    raw_spread = ASK - BID,
    ASK = ifelse(raw_spread < 0, NA, as.numeric(as.character(ASK))),
    BID = ifelse(raw_spread < 0, NA, as.numeric(as.character(BID)))
  ) %>%
  # Remove duplicate security-day rows
  distinct(PERMNO, date, .keep_all = TRUE) %>% 
  group_by(PERMNO) %>% 
  # Compute relevant variables per security
  mutate(
    mean_ret = rollmean(RET, k = 30, fill = NA, align = "right", na.rm = TRUE),
    std_ret = rollapply(RET, width = 30, FUN = sd, fill = NA,
                        align = "right", na.rm = TRUE),
    # Annualised with the post-period trading-day count
    volatility = std_ret * sqrt(trading_days_post),
    dollar_vol = PRC * VOL,
    # SHROUT is in thousands of shares
    market_cap = SHROUT * PRC * 1000,
    bid_ask_spread = (ASK - BID) / ((ASK + BID) / 2)
    ) %>% 
  ungroup() %>% 
  select(PERMNO, date, TICKER, bid_ask_spread, volatility, dollar_vol, market_cap)

# Stack the two periods into one panel and attach each ticker's pilot-program
# group; the joined column is renamed so regression coefficient labels read
# clearly in the summaries below.
group_lookup <- tick_pilot %>%
  select(Ticker_Symbol, Tick_Size_Pilot_Program_Group)

task6_panel <- bind_rows(pre_treatment_df1, post_treatment_df1) %>%
  left_join(group_lookup, by = c("TICKER" = "Ticker_Symbol")) %>%
  rename(Test_group_ = Tick_Size_Pilot_Program_Group)

# Remove missing values, create the post-period binary variable, and
# standardise the continuous regressors so they are on the same scale.
task6_panel_cleaned <- task6_panel %>%
  filter(!is.na(bid_ask_spread) & !is.na(volatility)) %>%
  mutate(
    # Compare Date to Date explicitly instead of relying on the implicit
    # character coercion of date < "2016-10-01"
    d_after = ifelse(date < as.Date("2016-10-01"), 0, 1),
    # scale() returns a 1-column matrix; drop to a plain numeric vector so
    # the data frame does not carry matrix columns into joins/model frames.
    scaled_spread = as.numeric(scale(bid_ask_spread)),
    scaled_volatility = as.numeric(scale(volatility)),
    scaled_dollar_vol = as.numeric(scale(dollar_vol)),
    scaled_market_cap = as.numeric(scale(market_cap)),
    Test_group_ = factor(Test_group_)
  )

# Inspect how many distinct tickers fall into each pilot group
task6_panel_cleaned %>%
  group_by(Test_group_) %>%
  summarise(Ticker_per_group = n_distinct(TICKER)) %>%
  print()
## # A tibble: 4 × 2
##   Test_group_ Ticker_per_group
##   <fct>                  <int>
## 1 C                        965
## 2 G1                       330
## 3 G2                       326
## 4 G3                       315

Descriptive Statistics

# Observe the average spread level and its dispersion within each group
task6_panel %>%
  group_by(Test_group_) %>%
  summarise(
    mean_spread = mean(bid_ask_spread, na.rm = TRUE),
    sd_spread = sd(bid_ask_spread, na.rm = TRUE)
  ) %>%
  print()
## # A tibble: 4 × 3
##   Test_group_ mean_spread sd_spread
##   <chr>             <dbl>     <dbl>
## 1 C               0.00621    0.0201
## 2 G1              0.00698    0.0171
## 3 G2              0.00823    0.0226
## 4 G3              0.00745    0.0197
# Visualise the average bid-ask spread across the entire period
task6_time_series <- ggplot(task6_panel, 
                            aes(x = date, 
                                y = bid_ask_spread, 
                                color = as.factor(Test_group_))) +
  
  # Plot the cross-sectional mean spread per date for each group
  geom_line(stat = "summary", 
            fun = "mean", 
            show.legend = FALSE) +
  
  labs(title = "Overall Trends in Bid-Ask Spread", 
       subtitle = "(Program Began on October 2016)",
       x = "Date", 
       y = "Average Relative Bid-Ask Spread") + 
  
  # FIX: date_format() lives in the scales package, which library(tidyverse)
  # does not attach; ggplot2's own date_labels argument avoids that
  # dependency and produces the same labels.
  scale_x_date(breaks = seq(as.Date("2015-11-01"), 
                            as.Date("2019-11-01"), 
                            by = "6 months"),
               date_labels = "%Y-%m-%d") + 
  
  # One panel per group to increase readability
  facet_wrap(~ as.factor(Test_group_), scales = "free",
             labeller = as_labeller(c("C" = "Control Group",
                                      "G1" = "Test Group One",
                                      "G2" = "Test Group Two",
                                      "G3" = "Test Group Three"))) +
  
  # Dashed vertical line marks the program's effective date
  geom_vline(xintercept = as.Date("2016-10-01"), linetype = "dashed", color = "black") + 
  
  theme(axis.text.x = element_text(angle = 55, hjust = 1))


print(task6_time_series)

# Save the plot for the report (save_plot is a project helper defined earlier)
save_plot(task6_time_series, "task6 time series plot")

DiD regressions

# Baseline DiD regression with fixed effects removed via the within
# transformation. Note the plm index order c("date", "PERMNO"): date is the
# "individual" dimension here, so the model absorbs date (time) effects —
# this keeps the time-invariant Test_group_ dummies identifiable (a stock
# fixed effect would absorb them). NOTE(review): confirm this index order is
# intentional; the random-effects model below indexes by TICKER instead of
# PERMNO — verify the two are meant to differ.
DiD_model_2 <- plm(bid_ask_spread ~ Test_group_ * d_after,
                   data = task6_panel_cleaned,
                   index = c("date", "PERMNO"),
                   model = "within")
summary(DiD_model_2)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = bid_ask_spread ~ Test_group_ * d_after, data = task6_panel_cleaned, 
##     model = "within", index = c("date", "PERMNO"))
## 
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
## -0.0108503 -0.0058366 -0.0047848 -0.0016908  1.9891194 
## 
## Coefficients:
##                          Estimate  Std. Error t-value  Pr(>|t|)    
## Test_group_G1         -5.9440e-04  8.4094e-05 -7.0682 1.570e-12 ***
## Test_group_G2         -3.2642e-05  8.3894e-05 -0.3891    0.6972    
## Test_group_G3         -4.8288e-04  8.5592e-05 -5.6417 1.685e-08 ***
## Test_group_G1:d_after  2.2008e-03  1.0328e-04 21.3096 < 2.2e-16 ***
## Test_group_G2:d_after  3.2146e-03  1.0324e-04 31.1371 < 2.2e-16 ***
## Test_group_G3:d_after  2.8112e-03  1.0511e-04 26.7463 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    484.64
## Residual Sum of Squares: 483.26
## R-Squared:      0.0028472
## Adj. R-Squared: 0.0022933
## F-statistic: 601.398 on 6 and 1263725 DF, p-value: < 2.22e-16
# Re-estimate as a random-effects model to gauge how much the fixed effects
# matter; the near-identical coefficients below suggest they matter little.
DiD_model_2_random <- plm(bid_ask_spread ~ Test_group_ * d_after,
                   data = task6_panel_cleaned,
                   index = c("date", "TICKER"),
                   model = "random")

summary(DiD_model_2_random)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = bid_ask_spread ~ Test_group_ * d_after, data = task6_panel_cleaned, 
##     model = "random", index = c("date", "TICKER"))
## 
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
## 
## Effects:
##                     var   std.dev share
## idiosyncratic 3.824e-04 1.956e-02     1
## individual    8.410e-08 2.900e-04     0
## theta:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1431  0.1487  0.1570  0.1548  0.1606  0.1617 
## 
## Residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.00894 -0.00589 -0.00495  0.00000 -0.00173  1.99038 
## 
## Coefficients:
##                          Estimate  Std. Error  z-value  Pr(>|z|)    
## (Intercept)            7.0332e-03  4.6635e-05 150.8119 < 2.2e-16 ***
## Test_group_G1         -5.9537e-04  8.4109e-05  -7.0785 1.457e-12 ***
## Test_group_G2         -3.2452e-05  8.3908e-05  -0.3868    0.6989    
## Test_group_G3         -4.8589e-04  8.5607e-05  -5.6759 1.380e-08 ***
## d_after               -1.4000e-03  5.7202e-05 -24.4750 < 2.2e-16 ***
## Test_group_G1:d_after  2.2015e-03  1.0330e-04  21.3120 < 2.2e-16 ***
## Test_group_G2:d_after  3.2150e-03  1.0326e-04  31.1352 < 2.2e-16 ***
## Test_group_G3:d_after  2.8152e-03  1.0512e-04  26.7803 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    485.07
## Residual Sum of Squares: 483.69
## R-Squared:      0.0028302
## Adj. R-Squared: 0.0028247
## Chisq: 3609.49 on 7 DF, p-value: < 2.22e-16
# Estimates are similar across the two specifications; the fixed-effects
# (within) version is preferred for the analysis that follows.

Other factors that may influence bid-ask spreads

# DiD with a full three-way interaction against scaled volatility, to test
# whether the treatment effect on spreads varies with a stock's volatility.
# Uses the standardised outcome (scaled_spread) so coefficients are in
# standard-deviation units.
DiD_model_3 <- plm(scaled_spread ~ Test_group_ * d_after + Test_group_ * d_after * scaled_volatility,
                   data = task6_panel_cleaned,
                   index = c("date", "TICKER"),
                   model = "within")

summary(DiD_model_3)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = scaled_spread ~ Test_group_ * d_after + Test_group_ * 
##     d_after * scaled_volatility, data = task6_panel_cleaned, 
##     model = "within", index = c("date", "TICKER"))
## 
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
## -11.300807  -0.276644  -0.202029  -0.067455 101.056980 
## 
## Coefficients:
##                                           Estimate Std. Error  t-value
## Test_group_G1                           -0.0129976  0.0043730  -2.9722
## Test_group_G2                            0.0033957  0.0043194   0.7861
## Test_group_G3                            0.0272294  0.0044381   6.1354
## scaled_volatility                        0.1910490  0.0021761  87.7937
## Test_group_G1:d_after                    0.0904628  0.0053047  17.0533
## Test_group_G2:d_after                    0.1292625  0.0052730  24.5141
## Test_group_G3:d_after                    0.0847384  0.0053896  15.7226
## Test_group_G1:scaled_volatility          0.0449286  0.0059994   7.4888
## Test_group_G2:scaled_volatility          0.0251111  0.0061947   4.0536
## Test_group_G3:scaled_volatility          0.2047588  0.0057873  35.3805
## d_after:scaled_volatility               -0.0658201  0.0026826 -24.5358
## Test_group_G1:d_after:scaled_volatility  0.0540895  0.0066556   8.1270
## Test_group_G2:d_after:scaled_volatility  0.0887940  0.0066382  13.3763
## Test_group_G3:d_after:scaled_volatility -0.0740496  0.0067536 -10.9644
##                                          Pr(>|t|)    
## Test_group_G1                            0.002956 ** 
## Test_group_G2                            0.431784    
## Test_group_G3                           8.498e-10 ***
## scaled_volatility                       < 2.2e-16 ***
## Test_group_G1:d_after                   < 2.2e-16 ***
## Test_group_G2:d_after                   < 2.2e-16 ***
## Test_group_G3:d_after                   < 2.2e-16 ***
## Test_group_G1:scaled_volatility         6.953e-14 ***
## Test_group_G2:scaled_volatility         5.044e-05 ***
## Test_group_G3:scaled_volatility         < 2.2e-16 ***
## d_after:scaled_volatility               < 2.2e-16 ***
## Test_group_G1:d_after:scaled_volatility 4.405e-16 ***
## Test_group_G2:d_after:scaled_volatility < 2.2e-16 ***
## Test_group_G3:d_after:scaled_volatility < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    1262800
## Residual Sum of Squares: 1206500
## R-Squared:      0.044616
## Adj. R-Squared: 0.04408
## F-statistic: 4215.4 on 14 and 1263717 DF, p-value: < 2.22e-16
# DiD with a three-way interaction against scaled dollar volume, to test
# whether the treatment effect on spreads varies with trading activity.
# Same panel, index, and within specification as DiD_model_3.
DiD_model_4 <- plm(scaled_spread ~ Test_group_ * d_after + Test_group_ * d_after * scaled_dollar_vol,
                   data = task6_panel_cleaned,
                   index = c("date", "TICKER"),
                   model = "within")

summary(DiD_model_4)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = scaled_spread ~ Test_group_ * d_after + Test_group_ * 
##     d_after * scaled_dollar_vol, data = task6_panel_cleaned, 
##     model = "within", index = c("date", "TICKER"))
## 
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
##  -0.567811  -0.294686  -0.236157  -0.076207 101.490316 
## 
## Coefficients:
##                                            Estimate  Std. Error  t-value
## Test_group_G1                           -0.02784746  0.00430939  -6.4620
## Test_group_G2                            0.00013458  0.00430455   0.0313
## Test_group_G3                           -0.01329252  0.00437034  -3.0415
## scaled_dollar_vol                       -0.23570473  0.00351861 -66.9881
## Test_group_G1:d_after                    0.11136970  0.00527469  21.1140
## Test_group_G2:d_after                    0.16319129  0.00527620  30.9297
## Test_group_G3:d_after                    0.13401777  0.00535500  25.0267
## Test_group_G1:scaled_dollar_vol         -0.01669069  0.00725472  -2.3007
## Test_group_G2:scaled_dollar_vol          0.01435802  0.00677809   2.1183
## Test_group_G3:scaled_dollar_vol          0.08083983  0.00613116  13.1851
## d_after:scaled_dollar_vol                0.16087000  0.00374039  43.0089
## Test_group_G1:d_after:scaled_dollar_vol -0.01403663  0.00778018  -1.8042
## Test_group_G2:d_after:scaled_dollar_vol -0.08149131  0.00737161 -11.0547
## Test_group_G3:d_after:scaled_dollar_vol -0.11238667  0.00669849 -16.7779
##                                          Pr(>|t|)    
## Test_group_G1                           1.033e-10 ***
## Test_group_G2                            0.975058    
## Test_group_G3                            0.002354 ** 
## scaled_dollar_vol                       < 2.2e-16 ***
## Test_group_G1:d_after                   < 2.2e-16 ***
## Test_group_G2:d_after                   < 2.2e-16 ***
## Test_group_G3:d_after                   < 2.2e-16 ***
## Test_group_G1:scaled_dollar_vol          0.021411 *  
## Test_group_G2:scaled_dollar_vol          0.034150 *  
## Test_group_G3:scaled_dollar_vol         < 2.2e-16 ***
## d_after:scaled_dollar_vol               < 2.2e-16 ***
## Test_group_G1:d_after:scaled_dollar_vol  0.071208 .  
## Test_group_G2:d_after:scaled_dollar_vol < 2.2e-16 ***
## Test_group_G3:d_after:scaled_dollar_vol < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    1262800
## Residual Sum of Squares: 1241000
## R-Squared:      0.017273
## Adj. R-Squared: 0.016721
## F-statistic: 1586.53 on 14 and 1263717 DF, p-value: < 2.22e-16
# DiD with a three-way interaction against scaled market capitalisation, to
# test whether the treatment effect on spreads varies with firm size.
# Same panel, index, and within specification as DiD_model_3 / DiD_model_4.
DiD_model_5 <- plm(scaled_spread ~ Test_group_ * d_after + Test_group_ * d_after * scaled_market_cap,
                   data = task6_panel_cleaned,
                   index = c("date", "TICKER"),
                   model = "within")

summary(DiD_model_5)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = scaled_spread ~ Test_group_ * d_after + Test_group_ * 
##     d_after * scaled_market_cap, data = task6_panel_cleaned, 
##     model = "within", index = c("date", "TICKER"))
## 
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
##  -0.757122  -0.326744  -0.212696   0.028823 101.366819 
## 
## Coefficients:
##                                           Estimate Std. Error   t-value
## Test_group_G1                           -0.0118479  0.0042641   -2.7785
## Test_group_G2                           -0.0200371  0.0043489   -4.6074
## Test_group_G3                           -0.0139776  0.0043616   -3.2047
## scaled_market_cap                       -0.3590842  0.0029151 -123.1815
## Test_group_G1:d_after                    0.1031756  0.0052069   19.8152
## Test_group_G2:d_after                    0.1810066  0.0052766   34.3036
## Test_group_G3:d_after                    0.1426427  0.0053156   26.8348
## Test_group_G1:scaled_market_cap          0.0278860  0.0056555    4.9308
## Test_group_G2:scaled_market_cap         -0.0322744  0.0060101   -5.3700
## Test_group_G3:scaled_market_cap          0.0156337  0.0059385    2.6326
## d_after:scaled_market_cap                0.1766160  0.0032151   54.9327
## Test_group_G1:d_after:scaled_market_cap -0.0585878  0.0062825   -9.3256
## Test_group_G2:d_after:scaled_market_cap -0.0425621  0.0066070   -6.4419
## Test_group_G3:d_after:scaled_market_cap -0.0529140  0.0065102   -8.1278
##                                          Pr(>|t|)    
## Test_group_G1                            0.005461 ** 
## Test_group_G2                           4.077e-06 ***
## Test_group_G3                            0.001352 ** 
## scaled_market_cap                       < 2.2e-16 ***
## Test_group_G1:d_after                   < 2.2e-16 ***
## Test_group_G2:d_after                   < 2.2e-16 ***
## Test_group_G3:d_after                   < 2.2e-16 ***
## Test_group_G1:scaled_market_cap         8.192e-07 ***
## Test_group_G2:scaled_market_cap         7.873e-08 ***
## Test_group_G3:scaled_market_cap          0.008474 ** 
## d_after:scaled_market_cap               < 2.2e-16 ***
## Test_group_G1:d_after:scaled_market_cap < 2.2e-16 ***
## Test_group_G2:d_after:scaled_market_cap 1.180e-10 ***
## Test_group_G3:d_after:scaled_market_cap 4.375e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    1262800
## Residual Sum of Squares: 1187700
## R-Squared:      0.059535
## Adj. R-Squared: 0.059007
## F-statistic: 5714.19 on 14 and 1263717 DF, p-value: < 2.22e-16